1 /*
   2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #ifndef _WINDOWS
  26 #include "alloca.h"
  27 #endif
  28 #include "asm/macroAssembler.hpp"
  29 #include "asm/macroAssembler.inline.hpp"
  30 #include "code/compiledIC.hpp"
  31 #include "code/debugInfoRec.hpp"
  32 #include "code/nativeInst.hpp"
  33 #include "code/vtableStubs.hpp"
  34 #include "compiler/oopMap.hpp"
  35 #include "gc/shared/collectedHeap.hpp"
  36 #include "gc/shared/gcLocker.hpp"
  37 #include "gc/shared/barrierSet.hpp"
  38 #include "gc/shared/barrierSetAssembler.hpp"
  39 #include "interpreter/interpreter.hpp"
  40 #include "logging/log.hpp"
  41 #include "memory/resourceArea.hpp"
  42 #include "memory/universe.hpp"
  43 #include "oops/klass.inline.hpp"
  44 #include "oops/method.inline.hpp"
  45 #include "prims/methodHandles.hpp"
  46 #include "runtime/continuation.hpp"
  47 #include "runtime/continuationEntry.inline.hpp"
  48 #include "runtime/globals.hpp"
  49 #include "runtime/jniHandles.hpp"
  50 #include "runtime/safepointMechanism.hpp"
  51 #include "runtime/sharedRuntime.hpp"
  52 #include "runtime/signature.hpp"
  53 #include "runtime/stubRoutines.hpp"
  54 #include "runtime/timerTrace.hpp"
  55 #include "runtime/vframeArray.hpp"
  56 #include "runtime/vm_version.hpp"
  57 #include "utilities/align.hpp"
  58 #include "utilities/checkedCast.hpp"
  59 #include "utilities/formatBuffer.hpp"
  60 #include "vmreg_x86.inline.hpp"
  61 #ifdef COMPILER1
  62 #include "c1/c1_Runtime1.hpp"
  63 #endif
  64 #ifdef COMPILER2
  65 #include "opto/runtime.hpp"
  66 #endif
  67 #if INCLUDE_JVMCI
  68 #include "jvmci/jvmciJavaClasses.hpp"
  69 #endif
  70 
  71 #define __ masm->
  72 
  73 #ifdef PRODUCT
  74 #define BLOCK_COMMENT(str) /* nothing */
  75 #else
  76 #define BLOCK_COMMENT(str) __ block_comment(str)
  77 #endif // PRODUCT
  78 
  79 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  80 
  81 class RegisterSaver {
  82   // Capture info about frame layout.  Layout offsets are in jint
  83   // units because compiler frame slots are jints.
  84 #define XSAVE_AREA_BEGIN 160
  85 #define XSAVE_AREA_YMM_BEGIN 576
  86 #define XSAVE_AREA_EGPRS 960
  87 #define XSAVE_AREA_OPMASK_BEGIN 1088
  88 #define XSAVE_AREA_ZMM_BEGIN 1152
  89 #define XSAVE_AREA_UPPERBANK 1664
  90 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
  91 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
  92 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
  93 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
  94 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  95   enum layout {
  96     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt,    // fxsave save area
  97     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,  // offset in fxsave save area
  98     DEF_XMM_OFFS(0),
  99     DEF_XMM_OFFS(1),
 100     // 2..15 are implied in range usage
 101     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 102     DEF_YMM_OFFS(0),
 103     DEF_YMM_OFFS(1),
 104     r16_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
 105     r16H_off,
 106     r17_off, r17H_off,
 107     r18_off, r18H_off,
 108     r19_off, r19H_off,
 109     r20_off, r20H_off,
 110     r21_off, r21H_off,
 111     r22_off, r22H_off,
 112     r23_off, r23H_off,
 113     r24_off, r24H_off,
 114     r25_off, r25H_off,
 115     r26_off, r26H_off,
 116     r27_off, r27H_off,
 117     r28_off, r28H_off,
 118     r29_off, r29H_off,
 119     r30_off, r30H_off,
 120     r31_off, r31H_off,
 121     opmask_off   = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 122     DEF_OPMASK_OFFS(0),
 123     DEF_OPMASK_OFFS(1),
 124     // 2..7 are implied in range usage
 125     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 126     DEF_ZMM_OFFS(0),
 127     DEF_ZMM_OFFS(1),
 128     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 129     DEF_ZMM_UPPER_OFFS(16),
 130     DEF_ZMM_UPPER_OFFS(17),
 131     // 18..31 are implied in range usage
 132     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 133     fpu_stateH_end,
 134     r15_off, r15H_off,
 135     r14_off, r14H_off,
 136     r13_off, r13H_off,
 137     r12_off, r12H_off,
 138     r11_off, r11H_off,
 139     r10_off, r10H_off,
 140     r9_off,  r9H_off,
 141     r8_off,  r8H_off,
 142     rdi_off, rdiH_off,
 143     rsi_off, rsiH_off,
 144     ignore_off, ignoreH_off,  // extra copy of rbp
 145     rsp_off, rspH_off,
 146     rbx_off, rbxH_off,
 147     rdx_off, rdxH_off,
 148     rcx_off, rcxH_off,
 149     rax_off, raxH_off,
 150     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 151     align_off, alignH_off,
 152     flags_off, flagsH_off,
 153     // The frame sender code expects that rbp will be in the "natural" place and
 154     // will override any oopMap setting for it. We must therefore force the layout
 155     // so that it agrees with the frame sender code.
 156     rbp_off, rbpH_off,        // copy of rbp we will restore
 157     return_off, returnH_off,  // slot for return address
 158     reg_save_size             // size in compiler stack slots
 159   };
 160 
 161  public:
 162   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 163   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 164 
 165   // Offsets into the register save area
 166   // Used by deoptimization when it is managing result register
 167   // values on its own
 168 
 169   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 170   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 171   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 172   static int r15_offset_in_bytes(void)    { return BytesPerInt * r15_off; }
 173   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 174   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 175 
 176   // During deoptimization only the result registers need to be restored,
 177   // all the other values have already been extracted.
 178   static void restore_result_registers(MacroAssembler* masm);
 179 };
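// Illustrative arithmetic (a sketch, not used by the VM): the *_offset_in_bytes()
// accessors above just scale the enum slots by BytesPerInt (4).  Assuming a
// platform where frame::arg_reg_save_area_bytes == 0 (e.g. Linux):
//
//   fpu_state_off          = 0
//   xmm0_off               = XSAVE_AREA_BEGIN / BytesPerInt = 160 / 4 = 40
//   xmm0_offset_in_bytes() = 4 * 40 = 160    // where XMM0 lives in the fxsave image
//
// i.e. the returned offsets are byte distances from rsp as it stands right after
// save_live_registers() has built the save area.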
 180 
 181 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 182   int off = 0;
 183   int num_xmm_regs = XMMRegister::available_xmm_registers();
 184 #if COMPILER2_OR_JVMCI
 185   if (save_wide_vectors && UseAVX == 0) {
 186     save_wide_vectors = false; // vectors larger than 16 bytes are supported only with AVX
 187   }
 188   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 189 #else
 190   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 191 #endif
 192 
 193   // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated
 194   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 195   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 196   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 197   // CodeBlob frame size is in words.
 198   int frame_size_in_words = frame_size_in_bytes / wordSize;
 199   *total_frame_words = frame_size_in_words;
 200 
 201   // Save registers, fpu state, and flags.
 202   // We assume caller has already pushed the return address onto the
 203   // stack, so rsp is 8-byte aligned here.
 204   // We push rbp twice in this sequence because we want the real rbp
 205   // to be under the return address like a normal enter.
 206 
 207   __ enter();          // rsp becomes 16-byte aligned here
 208   __ pushf();
 209   // Make sure rsp stays 16-byte aligned
 210   __ subq(rsp, 8);
 211   // Push CPU state in multiple of 16 bytes
 212   __ save_legacy_gprs();
 213   __ push_FPU_state();
 214 
 215 
 216   // push cpu state handles this on EVEX enabled targets
 217   if (save_wide_vectors) {
 218     // Save upper half of YMM registers(0..15)
 219     int base_addr = XSAVE_AREA_YMM_BEGIN;
 220     for (int n = 0; n < 16; n++) {
 221       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 222     }
 223     if (VM_Version::supports_evex()) {
 224       // Save upper half of ZMM registers(0..15)
 225       base_addr = XSAVE_AREA_ZMM_BEGIN;
 226       for (int n = 0; n < 16; n++) {
 227         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 228       }
 229       // Save full ZMM registers(16..num_xmm_regs)
 230       base_addr = XSAVE_AREA_UPPERBANK;
 231       off = 0;
 232       int vector_len = Assembler::AVX_512bit;
 233       for (int n = 16; n < num_xmm_regs; n++) {
 234         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 235       }
 236 #if COMPILER2_OR_JVMCI
 237       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 238       off = 0;
 239       for(int n = 0; n < KRegister::number_of_registers; n++) {
 240         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 241       }
 242 #endif
 243     }
 244   } else {
 245     if (VM_Version::supports_evex()) {
 246       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 247       int base_addr = XSAVE_AREA_UPPERBANK;
 248       off = 0;
 249       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 250       for (int n = 16; n < num_xmm_regs; n++) {
 251         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 252       }
 253 #if COMPILER2_OR_JVMCI
 254       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 255       off = 0;
 256       for(int n = 0; n < KRegister::number_of_registers; n++) {
 257         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 258       }
 259 #endif
 260     }
 261   }
 262 
 263 #if COMPILER2_OR_JVMCI
 264   if (UseAPX) {
 265       int base_addr = XSAVE_AREA_EGPRS;
 266       off = 0;
 267       for (int n = 16; n < Register::number_of_registers; n++) {
 268         __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
 269       }
 270   }
 271 #endif
 272 
 273   __ vzeroupper();
 274   if (frame::arg_reg_save_area_bytes != 0) {
 275     // Allocate argument register save area
 276     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 277   }
 278 
 279   // Set an oopmap for the call site.  This oopmap will map all
 280   // oop-registers and debug-info registers as callee-saved.  This
 281   // will allow deoptimization at this safepoint to find all possible
 282   // debug-info recordings, as well as let GC find all oops.
 283 
 284   OopMapSet *oop_maps = new OopMapSet();
 285   OopMap* map = new OopMap(frame_size_in_slots, 0);
 286 
 287 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 288 
 289   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 290   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 291   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 292   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 293   // rbp location is known implicitly by the frame sender code, needs no oopmap
 294   // and the location where rbp was saved is ignored
 295   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 296   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 297   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 298   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 299   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 300   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 301   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 302   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 303   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 304   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 305 
 306   if (UseAPX) {
 307     map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
 308     map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
 309     map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
 310     map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
 311     map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
 312     map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
 313     map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
 314     map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
 315     map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
 316     map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
 317     map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
 318     map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
 319     map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
 320     map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
 321     map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
 322     map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
 323   }
 324   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
 325   // on EVEX enabled targets it is also included in the xsave area
 326   off = xmm0_off;
 327   int delta = xmm1_off - off;
 328   for (int n = 0; n < 16; n++) {
 329     XMMRegister xmm_name = as_XMMRegister(n);
 330     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 331     off += delta;
 332   }
 333   if (UseAVX > 2) {
 334     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 335     off = zmm16_off;
 336     delta = zmm17_off - off;
 337     for (int n = 16; n < num_xmm_regs; n++) {
 338       XMMRegister zmm_name = as_XMMRegister(n);
 339       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 340       off += delta;
 341     }
 342   }
 343 
 344 #if COMPILER2_OR_JVMCI
 345   if (save_wide_vectors) {
 346     // Save upper half of YMM registers(0..15)
 347     off = ymm0_off;
 348     delta = ymm1_off - ymm0_off;
 349     for (int n = 0; n < 16; n++) {
 350       XMMRegister ymm_name = as_XMMRegister(n);
 351       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 352       off += delta;
 353     }
 354     if (VM_Version::supports_evex()) {
 355       // Save upper half of ZMM registers(0..15)
 356       off = zmm0_off;
 357       delta = zmm1_off - zmm0_off;
 358       for (int n = 0; n < 16; n++) {
 359         XMMRegister zmm_name = as_XMMRegister(n);
 360         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 361         off += delta;
 362       }
 363     }
 364   }
 365 #endif // COMPILER2_OR_JVMCI
 366 
 367   // %%% These should all be a waste but we'll keep things as they were for now
 368   if (true) {
 369     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 370     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 371     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 372     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 373     // rbp location is known implicitly by the frame sender code, needs no oopmap
 374     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 375     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 376     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 377     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 378     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 379     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 380     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 381     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 382     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 383     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 384     if (UseAPX) {
 385       map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
 386       map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
 387       map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
 388       map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
 389       map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
 390       map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
 391       map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
 392       map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
 393       map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
 394       map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
 395       map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
 396       map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
 397       map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
 398       map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
 399       map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
 400       map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
 401     }
 402     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
 403     // on EVEX enabled targets it is also included in the xsave area
 404     off = xmm0H_off;
 405     delta = xmm1H_off - off;
 406     for (int n = 0; n < 16; n++) {
 407       XMMRegister xmm_name = as_XMMRegister(n);
 408       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 409       off += delta;
 410     }
 411     if (UseAVX > 2) {
 412       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 413       off = zmm16H_off;
 414       delta = zmm17H_off - off;
 415       for (int n = 16; n < num_xmm_regs; n++) {
 416         XMMRegister zmm_name = as_XMMRegister(n);
 417         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 418         off += delta;
 419       }
 420     }
 421   }
 422 
 423   return map;
 424 }
 425 
 426 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 427   int num_xmm_regs = XMMRegister::available_xmm_registers();
 428   if (frame::arg_reg_save_area_bytes != 0) {
 429     // Pop arg register save area
 430     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 431   }
 432 
 433 #if COMPILER2_OR_JVMCI
 434   if (restore_wide_vectors) {
 435     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 436     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 437   }
 438 #else
 439   assert(!restore_wide_vectors, "vectors are generated only by C2");
 440 #endif
 441 
 442   __ vzeroupper();
 443 
 444   // On EVEX enabled targets everything is handled in pop fpu state
 445   if (restore_wide_vectors) {
 446     // Restore upper half of YMM registers (0..15)
 447     int base_addr = XSAVE_AREA_YMM_BEGIN;
 448     for (int n = 0; n < 16; n++) {
 449       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 450     }
 451     if (VM_Version::supports_evex()) {
 452       // Restore upper half of ZMM registers (0..15)
 453       base_addr = XSAVE_AREA_ZMM_BEGIN;
 454       for (int n = 0; n < 16; n++) {
 455         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 456       }
 457       // Restore full ZMM registers(16..num_xmm_regs)
 458       base_addr = XSAVE_AREA_UPPERBANK;
 459       int vector_len = Assembler::AVX_512bit;
 460       int off = 0;
 461       for (int n = 16; n < num_xmm_regs; n++) {
 462         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 463       }
 464 #if COMPILER2_OR_JVMCI
 465       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 466       off = 0;
 467       for (int n = 0; n < KRegister::number_of_registers; n++) {
 468         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 469       }
 470 #endif
 471     }
 472   } else {
 473     if (VM_Version::supports_evex()) {
 474       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 475       int base_addr = XSAVE_AREA_UPPERBANK;
 476       int off = 0;
 477       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 478       for (int n = 16; n < num_xmm_regs; n++) {
 479         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 480       }
 481 #if COMPILER2_OR_JVMCI
 482       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 483       off = 0;
 484       for (int n = 0; n < KRegister::number_of_registers; n++) {
 485         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 486       }
 487 #endif
 488     }
 489   }
 490 
 491 #if COMPILER2_OR_JVMCI
 492   if (UseAPX) {
 493     int base_addr = XSAVE_AREA_EGPRS;
 494     int off = 0;
 495     for (int n = 16; n < Register::number_of_registers; n++) {
 496       __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
 497     }
 498   }
 499 #endif
 500 
 501   // Recover CPU state
 502   __ pop_FPU_state();
 503   __ restore_legacy_gprs();
 504   __ addq(rsp, 8);
 505   __ popf();
 506   // Get the rbp described implicitly by the calling convention (no oopMap)
 507   __ pop(rbp);
 508 }
 509 
 510 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 511 
 512   // Just restore result register. Only used by deoptimization. By
 513   // now any callee save register that needs to be restored to a c2
 514   // caller of the deoptee has been extracted into the vframeArray
 515   // and will be stuffed into the c2i adapter we create for later
 516   // restoration, so only result registers need to be restored here.
 517 
 518   // Restore fp result register
 519   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 520   // Restore integer result register
 521   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 522   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 523 
 524   // Pop all of the register save area off the stack except the return address
 525   __ addptr(rsp, return_offset_in_bytes());
 526 }
 527 
 528 // Is vector's size (in bytes) bigger than a size saved by default?
 529 // 16 bytes XMM registers are saved by default using fxsave/fxrstor instructions.
 530 bool SharedRuntime::is_wide_vector(int size) {
 531   return size > 16;
 532 }
 533 
 534 // ---------------------------------------------------------------------------
 535 // Read the array of BasicTypes from a signature, and compute where the
 536 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 537 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 538 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 539 // as framesizes are fixed.
 540 // VMRegImpl::stack0 refers to the first slot 0(sp),
 541 // and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
 542 // Registers up to Register::number_of_registers are the 64-bit
 543 // integer registers.
 544 
 545 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 546 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 547 // units regardless of build. Of course for i486 there is no 64-bit build.
 548 
 549 // The Java calling convention is a "shifted" version of the C ABI.
 550 // By skipping the first C ABI register we can call non-static jni methods
 551 // with small numbers of arguments without having to shuffle the arguments
 552 // at all. Since we control the java ABI we ought to at least get some
 553 // advantage out of it.
 554 
 555 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 556                                            VMRegPair *regs,
 557                                            int total_args_passed) {
 558 
 559   // Create the mapping between argument positions and
 560   // registers.
 561   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 562     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 563   };
 564   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 565     j_farg0, j_farg1, j_farg2, j_farg3,
 566     j_farg4, j_farg5, j_farg6, j_farg7
 567   };
 568 
 569 
 570   uint int_args = 0;
 571   uint fp_args = 0;
 572   uint stk_args = 0;
 573 
 574   for (int i = 0; i < total_args_passed; i++) {
 575     switch (sig_bt[i]) {
 576     case T_BOOLEAN:
 577     case T_CHAR:
 578     case T_BYTE:
 579     case T_SHORT:
 580     case T_INT:
 581       if (int_args < Argument::n_int_register_parameters_j) {
 582         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 583       } else {
 584         stk_args = align_up(stk_args, 2);
 585         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 586         stk_args += 1;
 587       }
 588       break;
 589     case T_VOID:
 590       // halves of T_LONG or T_DOUBLE
 591       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 592       regs[i].set_bad();
 593       break;
 594     case T_LONG:
 595       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 596       // fall through
 597     case T_OBJECT:
 598     case T_ARRAY:
 599     case T_ADDRESS:
 600       if (int_args < Argument::n_int_register_parameters_j) {
 601         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 602       } else {
 603         stk_args = align_up(stk_args, 2);
 604         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 605         stk_args += 2;
 606       }
 607       break;
 608     case T_FLOAT:
 609       if (fp_args < Argument::n_float_register_parameters_j) {
 610         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 611       } else {
 612         stk_args = align_up(stk_args, 2);
 613         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 614         stk_args += 1;
 615       }
 616       break;
 617     case T_DOUBLE:
 618       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 619       if (fp_args < Argument::n_float_register_parameters_j) {
 620         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 621       } else {
 622         stk_args = align_up(stk_args, 2);
 623         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 624         stk_args += 2;
 625       }
 626       break;
 627     default:
 628       ShouldNotReachHere();
 629       break;
 630     }
 631   }
 632 
 633   return stk_args;
 634 }
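// Worked example (illustrative only, not used by the code): for a static method
// with Java signature (int, long, Object, double, float), i.e.
//   sig_bt = { T_INT, T_LONG, T_VOID, T_OBJECT, T_DOUBLE, T_VOID, T_FLOAT }
// the loop above produces
//   T_INT    -> j_rarg0            (int_args == 1)
//   T_LONG   -> j_rarg1, 2 slots   (int_args == 2); the trailing T_VOID half is set_bad()
//   T_OBJECT -> j_rarg2            (int_args == 3)
//   T_DOUBLE -> j_farg0, 2 slots   (fp_args  == 1); the trailing T_VOID half is set_bad()
//   T_FLOAT  -> j_farg1            (fp_args  == 2)
// and returns stk_args == 0, since nothing spilled to the stack.  A non-static
// method would have an extra leading T_OBJECT receiver, which would claim j_rarg0.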
 635 
 636 // Patch the callers callsite with entry to compiled code if it exists.
 637 static void patch_callers_callsite(MacroAssembler *masm) {
 638   Label L;
 639   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 640   __ jcc(Assembler::equal, L);
 641 
 642   // Save the current stack pointer
 643   __ mov(r13, rsp);
 644   // Schedule the branch target address early.
 645   // Call into the VM to patch the caller, then jump to compiled callee
 646   // rax isn't live so capture return address while we easily can
 647   __ movptr(rax, Address(rsp, 0));
 648 
 649   // align stack so push_CPU_state doesn't fault
 650   __ andptr(rsp, -(StackAlignmentInBytes));
 651   __ push_CPU_state();
 652   __ vzeroupper();
 653   // VM needs caller's callsite
 654   // VM needs target method
 655   // This needs to be a long call since we will relocate this adapter to
 656   // the codeBuffer and it may not reach
 657 
 658   // Allocate argument register save area
 659   if (frame::arg_reg_save_area_bytes != 0) {
 660     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 661   }
 662   __ mov(c_rarg0, rbx);
 663   __ mov(c_rarg1, rax);
 664   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 665 
 666   // De-allocate argument register save area
 667   if (frame::arg_reg_save_area_bytes != 0) {
 668     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 669   }
 670 
 671   __ vzeroupper();
 672   __ pop_CPU_state();
 673   // restore sp
 674   __ mov(rsp, r13);
 675   __ bind(L);
 676 }
 677 
 678 static void gen_c2i_adapter(MacroAssembler *masm,
 679                             int total_args_passed,
 680                             int comp_args_on_stack,
 681                             const BasicType *sig_bt,
 682                             const VMRegPair *regs,
 683                             Label& skip_fixup) {
 684   // Before we get into the guts of the C2I adapter, see if we should be here
 685   // at all.  We've come from compiled code and are attempting to jump to the
 686   // interpreter, which means the caller made a static call to get here
 687   // (vcalls always get a compiled target if there is one).  Check for a
 688   // compiled target.  If there is one, we need to patch the caller's call.
 689   patch_callers_callsite(masm);
 690 
 691   __ bind(skip_fixup);
 692 
 693   // Since all args are passed on the stack, total_args_passed *
 694   // Interpreter::stackElementSize is the space we need.
 695 
 696   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 697 
 698   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 699 
 700   // stack is aligned, keep it that way
 701   // This is not currently needed or enforced by the interpreter, but
 702   // we might as well conform to the ABI.
 703   extraspace = align_up(extraspace, 2*wordSize);
 704 
 705   // set senderSP value
 706   __ lea(r13, Address(rsp, wordSize));
 707 
 708 #ifdef ASSERT
 709   __ check_stack_alignment(r13, "sender stack not aligned");
 710 #endif
 711   if (extraspace > 0) {
 712     // Pop the return address
 713     __ pop(rax);
 714 
 715     __ subptr(rsp, extraspace);
 716 
 717     // Push the return address
 718     __ push(rax);
 719 
 720     // Account for the return address location since we store it first rather
 721     // than hold it in a register across all the shuffling
 722     extraspace += wordSize;
 723   }
 724 
 725 #ifdef ASSERT
 726   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 727 #endif
 728 
 729   // Now write the args into the outgoing interpreter space
 730   for (int i = 0; i < total_args_passed; i++) {
 731     if (sig_bt[i] == T_VOID) {
 732       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 733       continue;
 734     }
 735 
 736     // offset to start parameters
 737     int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
 738     int next_off = st_off - Interpreter::stackElementSize;
 739 
 740     // Say 4 args:
 741     // i   st_off
 742     // 0   32 T_LONG
 743     // 1   24 T_VOID
 744     // 2   16 T_OBJECT
 745     // 3    8 T_BOOL
 746     // -    0 return address
 747     //
 748     // However, to make things extra confusing: because we can fit a long/double in
 749     // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
 750     // leaves one slot empty and only stores to a single slot. In this case the
 751     // slot that is occupied is the T_VOID slot. See, I said it was confusing.
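    // To make that concrete with the 4-arg example above (illustrative only):
    // for i == 0 (the T_LONG) st_off == 32 and next_off == 24; the 64-bit value
    // is written once, at next_off (the slot labelled T_VOID), while the slot at
    // st_off is left alone (or filled with known junk in debug builds, see below).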
 752 
 753     VMReg r_1 = regs[i].first();
 754     VMReg r_2 = regs[i].second();
 755     if (!r_1->is_valid()) {
 756       assert(!r_2->is_valid(), "");
 757       continue;
 758     }
 759     if (r_1->is_stack()) {
 760       // memory to memory use rax
 761       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 762       if (!r_2->is_valid()) {
 763         // sign extend??
 764         __ movl(rax, Address(rsp, ld_off));
 765         __ movptr(Address(rsp, st_off), rax);
 766 
 767       } else {
 768 
 769         __ movq(rax, Address(rsp, ld_off));
 770 
 771         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 772         // T_DOUBLE and T_LONG use two slots in the interpreter
 773         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 774           // ld_off == LSW, ld_off+wordSize == MSW
 775           // st_off == MSW, next_off == LSW
 776           __ movq(Address(rsp, next_off), rax);
 777 #ifdef ASSERT
 778           // Overwrite the unused slot with known junk
 779           __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 780           __ movptr(Address(rsp, st_off), rax);
 781 #endif /* ASSERT */
 782         } else {
 783           __ movq(Address(rsp, st_off), rax);
 784         }
 785       }
 786     } else if (r_1->is_Register()) {
 787       Register r = r_1->as_Register();
 788       if (!r_2->is_valid()) {
 789         // must be only an int (or less) so move only 32 bits to the slot
 790         // why not sign extend??
 791         __ movl(Address(rsp, st_off), r);
 792       } else {
 793         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 794         // T_DOUBLE and T_LONG use two slots in the interpreter
 795         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 796           // long/double in gpr
 797 #ifdef ASSERT
 798           // Overwrite the unused slot with known junk
 799           __ mov64(rax, CONST64(0xdeadffffdeadaaab));
 800           __ movptr(Address(rsp, st_off), rax);
 801 #endif /* ASSERT */
 802           __ movq(Address(rsp, next_off), r);
 803         } else {
 804           __ movptr(Address(rsp, st_off), r);
 805         }
 806       }
 807     } else {
 808       assert(r_1->is_XMMRegister(), "");
 809       if (!r_2->is_valid()) {
 810         // only a float; use just part of the slot
 811         __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
 812       } else {
 813 #ifdef ASSERT
 814         // Overwrite the unused slot with known junk
 815         __ mov64(rax, CONST64(0xdeadffffdeadaaac));
 816         __ movptr(Address(rsp, st_off), rax);
 817 #endif /* ASSERT */
 818         __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
 819       }
 820     }
 821   }
 822 
 823   // Schedule the branch target address early.
 824   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 825   __ jmp(rcx);
 826 }
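// Sketch of the interpreter frame that gen_c2i_adapter() leaves behind
// (illustrative only; it reuses the 4-arg example from the comment above and
// assumes Interpreter::stackElementSize == 8):
//
//   rsp + 32 : unused half of the long (junk in debug builds)
//   rsp + 24 : long value
//   rsp + 16 : Object
//   rsp +  8 : bool
//   rsp +  0 : return address
//   r13      : senderSP, i.e. the caller's rsp just above the return address
//
// rbx still holds the Method*, and control transfers to its interpreter_entry.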
 827 
 828 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 829                                     int total_args_passed,
 830                                     int comp_args_on_stack,
 831                                     const BasicType *sig_bt,
 832                                     const VMRegPair *regs) {
 833 
 834   // Note: r13 contains the senderSP on entry. We must preserve it since
 835   // we may do an i2c -> c2i transition if we lose a race where compiled
 836   // code goes non-entrant while we get args ready.
 837   // In addition we use r13 to locate all the interpreter args as
 838   // we must align the stack to 16 bytes on an i2c entry, else we
 839   // lose the alignment we expect in all compiled code and the register
 840   // save code can segv when fxsave instructions find an improperly
 841   // aligned stack pointer.
 842 
 843   // Adapters can be frameless because they do not require the caller
 844   // to perform additional cleanup work, such as correcting the stack pointer.
 845   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 846   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 847   // even if a callee has modified the stack pointer.
 848   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 849   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 850   // up via the senderSP register).
 851   // In other words, if *either* the caller or callee is interpreted, we can
 852   // get the stack pointer repaired after a call.
 853   // This is why c2i and i2c adapters cannot be indefinitely composed.
 854   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 855   // both caller and callee would be compiled methods, and neither would
 856   // clean up the stack pointer changes performed by the two adapters.
 857   // If this happens, control eventually transfers back to the compiled
 858   // caller, but with an uncorrected stack, causing delayed havoc.
 859 
 860   // Must preserve original SP for loading incoming arguments because
 861   // we need to align the outgoing SP for compiled code.
 862   __ movptr(r11, rsp);
 863 
 864   // Pick up the return address
 865   __ pop(rax);
 866 
 867   // Convert 4-byte c2 stack slots to words.
 868   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
 869 
 870   if (comp_args_on_stack) {
 871     __ subptr(rsp, comp_words_on_stack * wordSize);
 872   }
 873 
 874   // Ensure compiled code always sees stack at proper alignment
 875   __ andptr(rsp, -16);
 876 
 877   // Push the return address so the stack is misaligned exactly as the youngest
 878   // frame expects it to be right after a call instruction.
 879   __ push(rax);
 880 
 881   // Put saved SP in another register
 882   const Register saved_sp = rax;
 883   __ movptr(saved_sp, r11);
 884 
 885   // Will jump to the compiled code just as if compiled code was doing it.
 886   // Pre-load the register-jump target early, to schedule it better.
 887   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
 888 
 889 #if INCLUDE_JVMCI
 890   if (EnableJVMCI) {
 891     // check if this call should be routed towards a specific entry point
 892     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 893     Label no_alternative_target;
 894     __ jcc(Assembler::equal, no_alternative_target);
 895     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
 896     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 897     __ bind(no_alternative_target);
 898   }
 899 #endif // INCLUDE_JVMCI
 900 
 901   // Now generate the shuffle code.  Pick up all register args and move the
 902   // rest through the floating point stack top.
 903   for (int i = 0; i < total_args_passed; i++) {
 904     if (sig_bt[i] == T_VOID) {
 905       // Longs and doubles are passed in native word order, but misaligned
 906       // in the 32-bit build.
 907       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 908       continue;
 909     }
 910 
 911     // Pick up 0, 1 or 2 words from SP+offset.
 912 
 913     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
 914             "scrambled load targets?");
 915     // Load in argument order going down.
 916     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
 917     // Point to interpreter value (vs. tag)
 918     int next_off = ld_off - Interpreter::stackElementSize;
 919     //
 920     //
 921     //
 922     VMReg r_1 = regs[i].first();
 923     VMReg r_2 = regs[i].second();
 924     if (!r_1->is_valid()) {
 925       assert(!r_2->is_valid(), "");
 926       continue;
 927     }
 928     if (r_1->is_stack()) {
 929       // Convert stack slot to an SP offset (+ wordSize to account for return address )
 930       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
 931 
 932       // We can use r13 as a temp here because compiled code doesn't need r13 as an input,
 933       // and if we end up going through a c2i because of a miss, a reasonable value of r13
 934       // will be generated.
 935       if (!r_2->is_valid()) {
 936         // sign extend???
 937         __ movl(r13, Address(saved_sp, ld_off));
 938         __ movptr(Address(rsp, st_off), r13);
 939       } else {
 940         //
 941         // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
 942         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
 943         // so we must adjust where to pick up the data to match the interpreter.
 944         //
 945         // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
 946         // are accessed at negative offsets, so the LSW is at the LOW address.
 947 
 948         // ld_off is MSW so get LSW
 949         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 950                            next_off : ld_off;
 951         __ movq(r13, Address(saved_sp, offset));
 952         // st_off is LSW (i.e. reg.first())
 953         __ movq(Address(rsp, st_off), r13);
 954       }
 955     } else if (r_1->is_Register()) {  // Register argument
 956       Register r = r_1->as_Register();
 957       assert(r != rax, "must be different");
 958       if (r_2->is_valid()) {
 959         //
 960         // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
 961         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
 962         // so we must adjust where to pick up the data to match the interpreter.
 963 
 964         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 965                            next_off : ld_off;
 966 
 967         // this can be a misaligned move
 968         __ movq(r, Address(saved_sp, offset));
 969       } else {
 970         // sign extend and use a full word?
 971         __ movl(r, Address(saved_sp, ld_off));
 972       }
 973     } else {
 974       if (!r_2->is_valid()) {
 975         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
 976       } else {
 977         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
 978       }
 979     }
 980   }
 981 
 982   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
 983 
 984   // 6243940 We might end up in handle_wrong_method if
 985   // the callee is deoptimized as we race thru here. If that
 986   // happens we don't want to take a safepoint because the
 987   // caller frame will look interpreted and arguments are now
 988   // "compiled" so it is much better to make this transition
 989   // invisible to the stack walking code. Unfortunately if
 990   // we try and find the callee by normal means a safepoint
 991   // is possible. So we stash the desired callee in the thread
 992   // and the VM will find it there should this case occur.
 993 
 994   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
 995 
 996   // Put Method* where a c2i would expect it should we end up there.
 997   // Only needed because the c2 resolve stubs return Method* as a result in
 998   // rax.
 999   __ mov(rax, rbx);
1000   __ jmp(r11);
1001 }
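// Sketch of the shuffle performed by gen_i2c_adapter() (illustrative only):
// incoming interpreter arguments are read from saved_sp (the interpreter's rsp
// on entry, preserved via r11/rax) at ld_off == (total_args_passed - i) *
// Interpreter::stackElementSize, with longs/doubles read from next_off, the
// slot the interpreter actually wrote (see gen_c2i_adapter above).  Outgoing
// compiled arguments land either directly in their target registers or at
// rsp + reg2stack() * VMRegImpl::stack_slot_size + wordSize, where the extra
// wordSize skips the freshly pushed return address.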
1002 
1003 // ---------------------------------------------------------------
1004 void SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
1005                                             int total_args_passed,
1006                                             int comp_args_on_stack,
1007                                             const BasicType *sig_bt,
1008                                             const VMRegPair *regs,
1009                                             AdapterHandlerEntry* handler) {
1010   address i2c_entry = __ pc();
1011 
1012   gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
1013 
1014   // -------------------------------------------------------------------------
1015   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
1016   // to the interpreter.  The args start out packed in the compiled layout.  They
1017   // need to be unpacked into the interpreter layout.  This will almost always
1018   // require some stack space.  We grow the current (compiled) stack, then repack
1019   // the args.  We  finally end in a jump to the generic interpreter entry point.
1020   // On exit from the interpreter, the interpreter will restore our SP (lest the
1021   // compiled code, which relies solely on SP and not RBP, get sick).
1022 
1023   address c2i_unverified_entry = __ pc();
1024   Label skip_fixup;
1025 
1026   Register data = rax;
1027   Register receiver = j_rarg0;
1028   Register temp = rbx;
1029 
1030   {
1031     __ ic_check(1 /* end_alignment */);
1032     __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
1033     // Method might have been compiled since the call site was patched to
1034     // interpreted; if that is the case, treat it as a miss so we can get
1035     // the call site corrected.
1036     __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1037     __ jcc(Assembler::equal, skip_fixup);
1038     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1039   }
1040 
1041   address c2i_entry = __ pc();
1042 
1043   // Class initialization barrier for static methods
1044   address c2i_no_clinit_check_entry = nullptr;
1045   if (VM_Version::supports_fast_class_init_checks()) {
1046     Label L_skip_barrier;
1047     Register method = rbx;
1048 
1049     { // Bypass the barrier for non-static methods
1050       Register flags = rscratch1;
1051       __ load_unsigned_short(flags, Address(method, Method::access_flags_offset()));
1052       __ testl(flags, JVM_ACC_STATIC);
1053       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1054     }
1055 
1056     Register klass = rscratch1;
1057     __ load_method_holder(klass, method);
1058     __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
1059 
1060     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1061 
1062     __ bind(L_skip_barrier);
1063     c2i_no_clinit_check_entry = __ pc();
1064   }
1065 
1066   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1067   bs->c2i_entry_barrier(masm);
1068 
1069   gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1070 
1071   handler->set_entry_points(i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
1072   return;
1073 }
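// Summary of the entry points generated above (descriptive only):
//   i2c_entry                 - interpreted caller -> compiled callee shuffle
//   c2i_unverified_entry      - inline cache check; an already (re)compiled
//                               target is treated as a miss so the call site
//                               gets corrected
//   c2i_entry                 - class-init barrier (when supported) and the
//                               BarrierSetAssembler c2i entry barrier, then
//                               compiled -> interpreter argument unpacking
//   c2i_no_clinit_check_entry - c2i entry past the class-init check, or nullptr
//                               if fast class init checks are unsupported
// All four are recorded via handler->set_entry_points().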
1074 
1075 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1076                                          VMRegPair *regs,
1077                                          int total_args_passed) {
1078 
1079 // We return the amount of VMRegImpl stack slots we need to reserve for all
1080 // the arguments NOT counting out_preserve_stack_slots.
1081 
1082 // NOTE: These arrays will have to change when c1 is ported
1083 #ifdef _WIN64
1084     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1085       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1086     };
1087     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1088       c_farg0, c_farg1, c_farg2, c_farg3
1089     };
1090 #else
1091     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1092       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1093     };
1094     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1095       c_farg0, c_farg1, c_farg2, c_farg3,
1096       c_farg4, c_farg5, c_farg6, c_farg7
1097     };
1098 #endif // _WIN64
1099 
1100 
1101     uint int_args = 0;
1102     uint fp_args = 0;
1103     uint stk_args = 0; // inc by 2 each time
1104 
1105     for (int i = 0; i < total_args_passed; i++) {
1106       switch (sig_bt[i]) {
1107       case T_BOOLEAN:
1108       case T_CHAR:
1109       case T_BYTE:
1110       case T_SHORT:
1111       case T_INT:
1112         if (int_args < Argument::n_int_register_parameters_c) {
1113           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1114 #ifdef _WIN64
1115           fp_args++;
1116           // Allocate slots for the callee to stuff register args on the stack.
1117           stk_args += 2;
1118 #endif
1119         } else {
1120           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1121           stk_args += 2;
1122         }
1123         break;
1124       case T_LONG:
1125         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1126         // fall through
1127       case T_OBJECT:
1128       case T_ARRAY:
1129       case T_ADDRESS:
1130       case T_METADATA:
1131         if (int_args < Argument::n_int_register_parameters_c) {
1132           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1133 #ifdef _WIN64
1134           fp_args++;
1135           stk_args += 2;
1136 #endif
1137         } else {
1138           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1139           stk_args += 2;
1140         }
1141         break;
1142       case T_FLOAT:
1143         if (fp_args < Argument::n_float_register_parameters_c) {
1144           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1145 #ifdef _WIN64
1146           int_args++;
1147           // Allocate slots for the callee to stuff register args on the stack.
1148           stk_args += 2;
1149 #endif
1150         } else {
1151           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1152           stk_args += 2;
1153         }
1154         break;
1155       case T_DOUBLE:
1156         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1157         if (fp_args < Argument::n_float_register_parameters_c) {
1158           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1159 #ifdef _WIN64
1160           int_args++;
1161           // Allocate slots for the callee to stuff register args on the stack.
1162           stk_args += 2;
1163 #endif
1164         } else {
1165           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1166           stk_args += 2;
1167         }
1168         break;
1169       case T_VOID: // Halves of longs and doubles
1170         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1171         regs[i].set_bad();
1172         break;
1173       default:
1174         ShouldNotReachHere();
1175         break;
1176       }
1177     }
1178 #ifdef _WIN64
1179   // The Windows ABI requires that we always allocate enough stack space
1180   // for 4 64-bit registers to be stored down.
1181   if (stk_args < 8) {
1182     stk_args = 8;
1183   }
1184 #endif // _WIN64
1185 
1186   return stk_args;
1187 }
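// Worked example (illustrative only): for sig_bt = { T_OBJECT, T_INT, T_DOUBLE, T_VOID }
// the loop above assigns
//   non-Windows (System V): c_rarg0, c_rarg1, c_farg0  -> returns 0 stack slots
//   Windows x64:            c_rarg0, c_rarg1, c_farg2  -> returns 8 stack slots
// On Windows the argument's position selects the register (hence the bumping of
// fp_args/int_args for the other register class), and the 8 returned slots are
// the mandatory 32-byte home space for the four register arguments.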
1188 
1189 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1190                                              uint num_bits,
1191                                              uint total_args_passed) {
1192   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1193          "only certain vector sizes are supported for now");
1194 
1195   static const XMMRegister VEC_ArgReg[32] = {
1196      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1197      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1198     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1199     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1200   };
1201 
1202   uint stk_args = 0;
1203   uint fp_args = 0;
1204 
1205   for (uint i = 0; i < total_args_passed; i++) {
1206     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1207     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1208     regs[i].set_pair(vmreg->next(next_val), vmreg);
1209   }
1210 
1211   return stk_args;
1212 }
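// Example (illustrative only): with num_bits == 256 each argument gets
// next_val == 7, so regs[i] spans VEC_ArgReg[i] .. VEC_ArgReg[i]->next(7),
// i.e. eight 32-bit slices == 32 bytes == one YMM register.  Vector arguments
// never spill here, so stk_args stays 0.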
1213 
1214 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1215   // We always ignore the frame_slots arg and just use the space just below the
1216   // frame pointer, which by this time is free to use.
1217   switch (ret_type) {
1218   case T_FLOAT:
1219     __ movflt(Address(rbp, -wordSize), xmm0);
1220     break;
1221   case T_DOUBLE:
1222     __ movdbl(Address(rbp, -wordSize), xmm0);
1223     break;
1224   case T_VOID:  break;
1225   default: {
1226     __ movptr(Address(rbp, -wordSize), rax);
1227     }
1228   }
1229 }
1230 
1231 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1232   // We always ignore the frame_slots arg and just use the space just below the
1233   // frame pointer, which by this time is free to use.
1234   switch (ret_type) {
1235   case T_FLOAT:
1236     __ movflt(xmm0, Address(rbp, -wordSize));
1237     break;
1238   case T_DOUBLE:
1239     __ movdbl(xmm0, Address(rbp, -wordSize));
1240     break;
1241   case T_VOID:  break;
1242   default: {
1243     __ movptr(rax, Address(rbp, -wordSize));
1244     }
1245   }
1246 }
1247 
1248 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1249     for ( int i = first_arg ; i < arg_count ; i++ ) {
1250       if (args[i].first()->is_Register()) {
1251         __ push(args[i].first()->as_Register());
1252       } else if (args[i].first()->is_XMMRegister()) {
1253         __ subptr(rsp, 2*wordSize);
1254         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1255       }
1256     }
1257 }
1258 
1259 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1260     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1261       if (args[i].first()->is_Register()) {
1262         __ pop(args[i].first()->as_Register());
1263       } else if (args[i].first()->is_XMMRegister()) {
1264         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1265         __ addptr(rsp, 2*wordSize);
1266       }
1267     }
1268 }
1269 
1270 static void verify_oop_args(MacroAssembler* masm,
1271                             const methodHandle& method,
1272                             const BasicType* sig_bt,
1273                             const VMRegPair* regs) {
1274   Register temp_reg = rbx;  // not part of any compiled calling seq
1275   if (VerifyOops) {
1276     for (int i = 0; i < method->size_of_parameters(); i++) {
1277       if (is_reference_type(sig_bt[i])) {
1278         VMReg r = regs[i].first();
1279         assert(r->is_valid(), "bad oop arg");
1280         if (r->is_stack()) {
1281           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1282           __ verify_oop(temp_reg);
1283         } else {
1284           __ verify_oop(r->as_Register());
1285         }
1286       }
1287     }
1288   }
1289 }
1290 
1291 static void check_continuation_enter_argument(VMReg actual_vmreg,
1292                                               Register expected_reg,
1293                                               const char* name) {
1294   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1295   assert(actual_vmreg->as_Register() == expected_reg,
1296          "%s is in unexpected register: %s instead of %s",
1297          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1298 }
1299 
1300 
1301 //---------------------------- continuation_enter_setup ---------------------------
1302 //
1303 // Arguments:
1304 //   None.
1305 //
1306 // Results:
1307 //   rsp: pointer to blank ContinuationEntry
1308 //
1309 // Kills:
1310 //   rax
1311 //
1312 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1313   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1314   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1315   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1316 
1317   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1318   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1319 
1320   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1321   OopMap* map = new OopMap(frame_size, 0);
1322 
1323   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1324   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1325   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1326 
1327   return map;
1328 }
1329 
1330 //---------------------------- fill_continuation_entry ---------------------------
1331 //
1332 // Arguments:
1333 //   rsp: pointer to blank Continuation entry
1334 //   reg_cont_obj: pointer to the continuation
1335 //   reg_flags: flags
1336 //
1337 // Results:
1338 //   rsp: pointer to filled out ContinuationEntry
1339 //
1340 // Kills:
1341 //   rax
1342 //
1343 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1344   assert_different_registers(rax, reg_cont_obj, reg_flags);
1345 #ifdef ASSERT
1346   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1347 #endif
1348   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1349   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1350   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1351   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1352   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1353 
1354   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1355   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1356   __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
1357   __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
1358 
1359   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1360   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
1361 }
1362 
1363 //---------------------------- continuation_enter_cleanup ---------------------------
1364 //
1365 // Arguments:
1366 //   rsp: pointer to the ContinuationEntry
1367 //
1368 // Results:
1369 //   rsp: pointer to the spilled rbp in the entry frame
1370 //
1371 // Kills:
1372 //   rbx
1373 //
1374 static void continuation_enter_cleanup(MacroAssembler* masm) {
1375 #ifdef ASSERT
1376   Label L_good_sp;
1377   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1378   __ jcc(Assembler::equal, L_good_sp);
1379   __ stop("Incorrect rsp at continuation_enter_cleanup");
1380   __ bind(L_good_sp);
1381 #endif
1382   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1383   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1384 
1385   if (CheckJNICalls) {
1386     // Check if this is a virtual thread continuation
1387     Label L_skip_vthread_code;
1388     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1389     __ jcc(Assembler::equal, L_skip_vthread_code);
1390 
1391     // If the held monitor count is > 0 and this vthread is terminating then
1392     // it failed to release a JNI monitor. So we issue the same log message
1393     // that JavaThread::exit does.
1394     __ cmpptr(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1395     __ jcc(Assembler::equal, L_skip_vthread_code);
1396 
1397     // rax may hold an exception oop, save it before the call
1398     __ push(rax);
1399     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::log_jni_monitor_still_held));
1400     __ pop(rax);
1401 
1402     // For vthreads we have to explicitly zero the JNI monitor count of the carrier
1403     // on termination. The held count is implicitly zeroed below when we restore from
1404     // the parent held count (which has to be zero).
1405     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1406 
1407     __ bind(L_skip_vthread_code);
1408   }
1409 #ifdef ASSERT
1410   else {
1411     // Check if this is a virtual thread continuation
1412     Label L_skip_vthread_code;
1413     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1414     __ jcc(Assembler::equal, L_skip_vthread_code);
1415 
    // See comment just above. If not checking JNI calls, the JNI count is only
1417     // needed for assertion checking.
1418     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1419 
1420     __ bind(L_skip_vthread_code);
1421   }
1422 #endif
1423 
1424   __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
1425   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
1426 
1427   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1428   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1429   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1430 }
1431 
1432 static void gen_continuation_enter(MacroAssembler* masm,
1433                                    const VMRegPair* regs,
1434                                    int& exception_offset,
1435                                    OopMapSet* oop_maps,
1436                                    int& frame_complete,
1437                                    int& stack_slots,
1438                                    int& interpreted_entry_offset,
1439                                    int& compiled_entry_offset) {
1440 
1441   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1442   int pos_cont_obj   = 0;
1443   int pos_is_cont    = 1;
1444   int pos_is_virtual = 2;
1445 
  // The platform-specific calling convention may present the arguments in various registers.
  // To simplify the rest of the code, we expect the arguments to reside in these known
  // registers, and we additionally check the placement here in case the calling convention
  // ever changes.
1450   Register reg_cont_obj   = c_rarg1;
1451   Register reg_is_cont    = c_rarg2;
1452   Register reg_is_virtual = c_rarg3;
1453 
1454   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1455   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1456   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1457 
1458   // Utility methods kill rax, make sure there are no collisions
1459   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1460 
1461   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1462                          relocInfo::static_call_type);
1463 
1464   address start = __ pc();
1465 
1466   Label L_thaw, L_exit;
1467 
1468   // i2i entry used at interp_only_mode only
1469   interpreted_entry_offset = __ pc() - start;
1470   {
1471 #ifdef ASSERT
1472     Label is_interp_only;
1473     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1474     __ jcc(Assembler::notEqual, is_interp_only);
1475     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1476     __ bind(is_interp_only);
1477 #endif
1478 
1479     __ pop(rax); // return address
1480     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
1481     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1482     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1483     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
1484     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1485     __ push(rax); // return address
1486     __ push_cont_fastpath();
1487 
1488     __ enter();
1489 
1490     stack_slots = 2; // will be adjusted in setup
1491     OopMap* map = continuation_enter_setup(masm, stack_slots);
    // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe.
    // That's okay: at the very worst we'll miss an async sample, and we're in interp_only_mode anyway.
1494 
1495     __ verify_oop(reg_cont_obj);
1496 
1497     fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1498 
1499     // If continuation, call to thaw. Otherwise, resolve the call and exit.
1500     __ testptr(reg_is_cont, reg_is_cont);
1501     __ jcc(Assembler::notZero, L_thaw);
1502 
1503     // --- Resolve path
1504 
1505     // Make sure the call is patchable
1506     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1507     // Emit stub for static call
1508     address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1509     if (stub == nullptr) {
1510       fatal("CodeCache is full at gen_continuation_enter");
1511     }
1512     __ call(resolve);
1513     oop_maps->add_gc_map(__ pc() - start, map);
1514     __ post_call_nop();
1515 
1516     __ jmp(L_exit);
1517   }
1518 
1519   // compiled entry
1520   __ align(CodeEntryAlignment);
1521   compiled_entry_offset = __ pc() - start;
1522   __ enter();
1523 
1524   stack_slots = 2; // will be adjusted in setup
1525   OopMap* map = continuation_enter_setup(masm, stack_slots);
1526 
1527   // Frame is now completed as far as size and linkage.
1528   frame_complete = __ pc() - start;
1529 
1530   __ verify_oop(reg_cont_obj);
1531 
1532   fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1533 
1534   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1535   __ testptr(reg_is_cont, reg_is_cont);
1536   __ jccb(Assembler::notZero, L_thaw);
1537 
1538   // --- call Continuation.enter(Continuation c, boolean isContinue)
1539 
1540   // Make sure the call is patchable
1541   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1542 
1543   // Emit stub for static call
1544   address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1545   if (stub == nullptr) {
1546     fatal("CodeCache is full at gen_continuation_enter");
1547   }
1548 
1549   // The call needs to be resolved. There's a special case for this in
1550   // SharedRuntime::find_callee_info_helper() which calls
1551   // LinkResolver::resolve_continuation_enter() which resolves the call to
1552   // Continuation.enter(Continuation c, boolean isContinue).
1553   __ call(resolve);
1554 
1555   oop_maps->add_gc_map(__ pc() - start, map);
1556   __ post_call_nop();
1557 
1558   __ jmpb(L_exit);
1559 
1560   // --- Thawing path
1561 
1562   __ bind(L_thaw);
1563 
1564   ContinuationEntry::_thaw_call_pc_offset = __ pc() - start;
1565   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1566 
1567   ContinuationEntry::_return_pc_offset = __ pc() - start;
1568   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1569   __ post_call_nop();
1570 
1571   // --- Normal exit (resolve/thawing)
1572 
1573   __ bind(L_exit);
1574   ContinuationEntry::_cleanup_offset = __ pc() - start;
1575   continuation_enter_cleanup(masm);
1576   __ pop(rbp);
1577   __ ret(0);
1578 
1579   // --- Exception handling path
1580 
1581   exception_offset = __ pc() - start;
1582 
1583   continuation_enter_cleanup(masm);
1584   __ pop(rbp);
1585 
1586   __ movptr(c_rarg0, r15_thread);
1587   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1588 
1589   // rax still holds the original exception oop, save it before the call
1590   __ push(rax);
1591 
1592   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1593   __ movptr(rbx, rax);
1594 
1595   // Continue at exception handler:
1596   //   rax: exception oop
1597   //   rbx: exception handler
1598   //   rdx: exception pc
1599   __ pop(rax);
1600   __ verify_oop(rax);
1601   __ pop(rdx);
1602   __ jmp(rbx);
1603 }
1604 
1605 static void gen_continuation_yield(MacroAssembler* masm,
1606                                    const VMRegPair* regs,
1607                                    OopMapSet* oop_maps,
1608                                    int& frame_complete,
1609                                    int& stack_slots,
1610                                    int& compiled_entry_offset) {
1611   enum layout {
1612     rbp_off,
1613     rbpH_off,
1614     return_off,
1615     return_off2,
1616     framesize // inclusive of return address
1617   };
1618   stack_slots = framesize /  VMRegImpl::slots_per_word;
1619   assert(stack_slots == 2, "recheck layout");
1620 
1621   address start = __ pc();
1622   compiled_entry_offset = __ pc() - start;
1623   __ enter();
1624   address the_pc = __ pc();
1625 
1626   frame_complete = the_pc - start;
1627 
1628   // This nop must be exactly at the PC we push into the frame info.
  // We use this nop for fast CodeBlob lookup, so associate the OopMap
  // with it right away.
1631   __ post_call_nop();
1632   OopMap* map = new OopMap(framesize, 1);
1633   oop_maps->add_gc_map(frame_complete, map);
1634 
1635   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1636   __ movptr(c_rarg0, r15_thread);
1637   __ movptr(c_rarg1, rsp);
1638   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1639   __ reset_last_Java_frame(true);
1640 
1641   Label L_pinned;
1642 
1643   __ testptr(rax, rax);
1644   __ jcc(Assembler::notZero, L_pinned);
1645 
1646   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1647   continuation_enter_cleanup(masm);
1648   __ pop(rbp);
1649   __ ret(0);
1650 
1651   __ bind(L_pinned);
1652 
1653   // Pinned, return to caller
1654 
1655   // handle pending exception thrown by freeze
1656   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1657   Label ok;
1658   __ jcc(Assembler::equal, ok);
1659   __ leave();
1660   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1661   __ bind(ok);
1662 
1663   __ leave();
1664   __ ret(0);
1665 }
1666 
1667 void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) {
1668   ::continuation_enter_cleanup(masm);
1669 }
1670 
1671 static void gen_special_dispatch(MacroAssembler* masm,
1672                                  const methodHandle& method,
1673                                  const BasicType* sig_bt,
1674                                  const VMRegPair* regs) {
1675   verify_oop_args(masm, method, sig_bt, regs);
1676   vmIntrinsics::ID iid = method->intrinsic_id();
1677 
1678   // Now write the args into the outgoing interpreter space
1679   bool     has_receiver   = false;
1680   Register receiver_reg   = noreg;
1681   int      member_arg_pos = -1;
1682   Register member_reg     = noreg;
1683   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1684   if (ref_kind != 0) {
1685     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1686     member_reg = rbx;  // known to be free at this point
1687     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1688   } else if (iid == vmIntrinsics::_invokeBasic) {
1689     has_receiver = true;
1690   } else if (iid == vmIntrinsics::_linkToNative) {
1691     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1692     member_reg = rbx;  // known to be free at this point
1693   } else {
1694     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1695   }
1696 
1697   if (member_reg != noreg) {
1698     // Load the member_arg into register, if necessary.
1699     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1700     VMReg r = regs[member_arg_pos].first();
1701     if (r->is_stack()) {
1702       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1703     } else {
1704       // no data motion is needed
1705       member_reg = r->as_Register();
1706     }
1707   }
1708 
1709   if (has_receiver) {
1710     // Make sure the receiver is loaded into a register.
1711     assert(method->size_of_parameters() > 0, "oob");
1712     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1713     VMReg r = regs[0].first();
1714     assert(r->is_valid(), "bad receiver arg");
1715     if (r->is_stack()) {
1716       // Porting note:  This assumes that compiled calling conventions always
1717       // pass the receiver oop in a register.  If this is not true on some
1718       // platform, pick a temp and load the receiver from stack.
1719       fatal("receiver always in a register");
1720       receiver_reg = j_rarg0;  // known to be free at this point
1721       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1722     } else {
1723       // no data motion is needed
1724       receiver_reg = r->as_Register();
1725     }
1726   }
1727 
1728   // Figure out which address we are really jumping to:
1729   MethodHandles::generate_method_handle_dispatch(masm, iid,
1730                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1731 }
1732 
1733 // ---------------------------------------------------------------------------
1734 // Generate a native wrapper for a given method.  The method takes arguments
1735 // in the Java compiled code convention, marshals them to the native
1736 // convention (handlizes oops, etc), transitions to native, makes the call,
1737 // returns to java state (possibly blocking), unhandlizes any result and
1738 // returns.
1739 //
// Critical native functions are a shorthand for the use of
// GetPrimitiveArrayCritical and disallow the use of any other JNI
// functions.  The wrapper is expected to unpack the arguments before
// passing them to the callee. Critical native functions leave the state _in_Java,
// since they cannot stop for GC.
// Some other parts of JNI setup are also skipped, such as the tear down of the JNI
// handle block and the check for pending exceptions, since it's impossible for them
// to be thrown.
1748 //
1749 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1750                                                 const methodHandle& method,
1751                                                 int compile_id,
1752                                                 BasicType* in_sig_bt,
1753                                                 VMRegPair* in_regs,
1754                                                 BasicType ret_type) {
1755   if (method->is_continuation_native_intrinsic()) {
1756     int exception_offset = -1;
1757     OopMapSet* oop_maps = new OopMapSet();
1758     int frame_complete = -1;
1759     int stack_slots = -1;
1760     int interpreted_entry_offset = -1;
1761     int vep_offset = -1;
1762     if (method->is_continuation_enter_intrinsic()) {
1763       gen_continuation_enter(masm,
1764                              in_regs,
1765                              exception_offset,
1766                              oop_maps,
1767                              frame_complete,
1768                              stack_slots,
1769                              interpreted_entry_offset,
1770                              vep_offset);
1771     } else if (method->is_continuation_yield_intrinsic()) {
1772       gen_continuation_yield(masm,
1773                              in_regs,
1774                              oop_maps,
1775                              frame_complete,
1776                              stack_slots,
1777                              vep_offset);
1778     } else {
1779       guarantee(false, "Unknown Continuation native intrinsic");
1780     }
1781 
1782 #ifdef ASSERT
1783     if (method->is_continuation_enter_intrinsic()) {
1784       assert(interpreted_entry_offset != -1, "Must be set");
1785       assert(exception_offset != -1,         "Must be set");
1786     } else {
1787       assert(interpreted_entry_offset == -1, "Must be unset");
1788       assert(exception_offset == -1,         "Must be unset");
1789     }
1790     assert(frame_complete != -1,    "Must be set");
1791     assert(stack_slots != -1,       "Must be set");
1792     assert(vep_offset != -1,        "Must be set");
1793 #endif
1794 
1795     __ flush();
1796     nmethod* nm = nmethod::new_native_nmethod(method,
1797                                               compile_id,
1798                                               masm->code(),
1799                                               vep_offset,
1800                                               frame_complete,
1801                                               stack_slots,
1802                                               in_ByteSize(-1),
1803                                               in_ByteSize(-1),
1804                                               oop_maps,
1805                                               exception_offset);
1806     if (nm == nullptr) return nm;
1807     if (method->is_continuation_enter_intrinsic()) {
1808       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
1809     } else if (method->is_continuation_yield_intrinsic()) {
1810       _cont_doYield_stub = nm;
1811     }
1812     return nm;
1813   }
1814 
1815   if (method->is_method_handle_intrinsic()) {
1816     vmIntrinsics::ID iid = method->intrinsic_id();
1817     intptr_t start = (intptr_t)__ pc();
1818     int vep_offset = ((intptr_t)__ pc()) - start;
1819     gen_special_dispatch(masm,
1820                          method,
1821                          in_sig_bt,
1822                          in_regs);
1823     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1824     __ flush();
1825     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1826     return nmethod::new_native_nmethod(method,
1827                                        compile_id,
1828                                        masm->code(),
1829                                        vep_offset,
1830                                        frame_complete,
1831                                        stack_slots / VMRegImpl::slots_per_word,
1832                                        in_ByteSize(-1),
1833                                        in_ByteSize(-1),
1834                                        nullptr);
1835   }
1836   address native_func = method->native_function();
1837   assert(native_func != nullptr, "must have function");
1838 
1839   // An OopMap for lock (and class if static)
1840   OopMapSet *oop_maps = new OopMapSet();
1841   intptr_t start = (intptr_t)__ pc();
1842 
  // We have received a description of where all the java args are located
1844   // on entry to the wrapper. We need to convert these args to where
1845   // the jni function will expect them. To figure out where they go
1846   // we convert the java signature to a C signature by inserting
1847   // the hidden arguments as arg[0] and possibly arg[1] (static method)
1848 
1849   const int total_in_args = method->size_of_parameters();
1850   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1851 
1852   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1853   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1854 
1855   int argc = 0;
1856   out_sig_bt[argc++] = T_ADDRESS;
1857   if (method->is_static()) {
1858     out_sig_bt[argc++] = T_OBJECT;
1859   }
1860 
1861   for (int i = 0; i < total_in_args ; i++ ) {
1862     out_sig_bt[argc++] = in_sig_bt[i];
1863   }
1864 
1865   // Now figure out where the args must be stored and how much stack space
1866   // they require.
1867   int out_arg_slots;
1868   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
1869 
1870   // Compute framesize for the wrapper.  We need to handlize all oops in
1871   // incoming registers
1872 
1873   // Calculate the total number of stack slots we will need.
1874 
1875   // First count the abi requirement plus all of the outgoing args
1876   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1877 
1878   // Now the space for the inbound oop handle area
1879   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1880 
1881   int oop_handle_offset = stack_slots;
1882   stack_slots += total_save_slots;
1883 
1884   // Now any space we need for handlizing a klass if static method
1885 
1886   int klass_slot_offset = 0;
1887   int klass_offset = -1;
1888   int lock_slot_offset = 0;
1889   bool is_static = false;
1890 
1891   if (method->is_static()) {
1892     klass_slot_offset = stack_slots;
1893     stack_slots += VMRegImpl::slots_per_word;
1894     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1895     is_static = true;
1896   }
1897 
1898   // Plus a lock if needed
1899 
1900   if (method->is_synchronized()) {
1901     lock_slot_offset = stack_slots;
1902     stack_slots += VMRegImpl::slots_per_word;
1903   }
1904 
  // Now a place (+2 slots) to save return values or temps during shuffling,
  // plus 4 slots for the return address (which we own) and the saved rbp
1907   stack_slots += 6;
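  // That is 6 slots (24 bytes) in total: 8 bytes of scratch just below rbp (the
  // "2 slots for moves" in the layout below, also used by save_native_result) plus
  // 16 bytes for the return address and the saved rbp.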
1908 
1909   // Ok The space we have allocated will look like:
1910   //
1911   //
1912   // FP-> |                     |
1913   //      |---------------------|
1914   //      | 2 slots for moves   |
1915   //      |---------------------|
1916   //      | lock box (if sync)  |
1917   //      |---------------------| <- lock_slot_offset
1918   //      | klass (if static)   |
1919   //      |---------------------| <- klass_slot_offset
1920   //      | oopHandle area      |
1921   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1922   //      | outbound memory     |
1923   //      | based arguments     |
1924   //      |                     |
1925   //      |---------------------|
1926   //      |                     |
1927   // SP-> | out_preserved_slots |
1928   //
1929   //
1930 
1931 
  // Now compute the actual number of stack words we need, rounding to keep the
  // stack properly aligned.
1934   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1935 
1936   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
1937 
1938   // First thing make an ic check to see if we should even be here
1939 
1940   // We are free to use all registers as temps without saving them and
1941   // restoring them except rbp. rbp is the only callee save register
1942   // as far as the interpreter and the compiler(s) are concerned.
1943 
1944   const Register receiver = j_rarg0;
1945 
1946   Label exception_pending;
1947 
1948   assert_different_registers(receiver, rscratch1, rscratch2);
1949   __ verify_oop(receiver);
1950   __ ic_check(8 /* end_alignment */);
1951 
1952   int vep_offset = ((intptr_t)__ pc()) - start;
1953 
1954   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1955     Label L_skip_barrier;
1956     Register klass = r10;
1957     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1958     __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
1959 
1960     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1961 
1962     __ bind(L_skip_barrier);
1963   }
1964 
1965 #ifdef COMPILER1
1966   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
1967   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1968     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1969   }
1970 #endif // COMPILER1
1971 
1972   // The instruction at the verified entry point must be 5 bytes or longer
1973   // because it can be patched on the fly by make_non_entrant. The stack bang
1974   // instruction fits that requirement.
1975 
1976   // Generate stack overflow check
1977   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1978 
1979   // Generate a new frame for the wrapper.
1980   __ enter();
1981   // -2 because return address is already present and so is saved rbp
1982   __ subptr(rsp, stack_size - 2*wordSize);
1983 
1984   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1985   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
1986   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
1987 
1988   // Frame is now completed as far as size and linkage.
1989   int frame_complete = ((intptr_t)__ pc()) - start;
1990 
1991 #ifdef ASSERT
1992   __ check_stack_alignment(rsp, "improperly aligned stack");
1993 #endif /* ASSERT */
1994 
1995 
  // We use r14 as the oop handle for the receiver/klass.
  // It is callee-saved, so it survives the call to the native function.
1998 
1999   const Register oop_handle_reg = r14;
2000 
2001   //
  // We immediately shuffle the arguments so that, for any VM call we have to
  // make from here on out (sync slow path, jvmti, etc.), we will have
  // captured the oops from our caller and have a valid oopMap for
  // them.
2006 
2007   // -----------------
2008   // The Grand Shuffle
2009 
2010   // The Java calling convention is either equal (linux) or denser (win64) than the
  // c calling convention. However, because of the jni_env argument the c calling
2012   // convention always has at least one more (and two for static) arguments than Java.
2013   // Therefore if we move the args from java -> c backwards then we will never have
2014   // a register->register conflict and we don't have to build a dependency graph
2015   // and figure out how to break any cycles.
2016   //
2017 
2018   // Record esp-based slot for receiver on stack for non-static methods
2019   int receiver_offset = -1;
2020 
2021   // This is a trick. We double the stack slots so we can claim
2022   // the oops in the caller's frame. Since we are sure to have
  // more args than the caller, doubling is enough to make
2024   // sure we can capture all the incoming oop args from the
2025   // caller.
2026   //
2027   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2028 
2029   // Mark location of rbp (someday)
2030   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2031 
2032   // Use eax, ebx as temporaries during any memory-memory moves we have to do
2033   // All inbound args are referenced based on rbp and all outbound args via rsp.
2034 
2035 
2036 #ifdef ASSERT
2037   bool reg_destroyed[Register::number_of_registers];
2038   bool freg_destroyed[XMMRegister::number_of_registers];
2039   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2040     reg_destroyed[r] = false;
2041   }
2042   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2043     freg_destroyed[f] = false;
2044   }
2045 
2046 #endif /* ASSERT */
2047 
2048   // For JNI natives the incoming and outgoing registers are offset upwards.
2049   GrowableArray<int> arg_order(2 * total_in_args);
2050 
2051   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2052     arg_order.push(i);
2053     arg_order.push(c_arg);
2054   }
2055 
2056   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2057     int i = arg_order.at(ai);
2058     int c_arg = arg_order.at(ai + 1);
2059     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2060 #ifdef ASSERT
2061     if (in_regs[i].first()->is_Register()) {
2062       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2063     } else if (in_regs[i].first()->is_XMMRegister()) {
2064       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2065     }
2066     if (out_regs[c_arg].first()->is_Register()) {
2067       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2068     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2069       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2070     }
2071 #endif /* ASSERT */
2072     switch (in_sig_bt[i]) {
2073       case T_ARRAY:
2074       case T_OBJECT:
2075         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2076                     ((i == 0) && (!is_static)),
2077                     &receiver_offset);
2078         break;
2079       case T_VOID:
2080         break;
2081 
2082       case T_FLOAT:
2083         __ float_move(in_regs[i], out_regs[c_arg]);
        break;
2085 
2086       case T_DOUBLE:
2087         assert( i + 1 < total_in_args &&
2088                 in_sig_bt[i + 1] == T_VOID &&
2089                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2090         __ double_move(in_regs[i], out_regs[c_arg]);
2091         break;
2092 
2093       case T_LONG :
2094         __ long_move(in_regs[i], out_regs[c_arg]);
2095         break;
2096 
2097       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2098 
2099       default:
2100         __ move32_64(in_regs[i], out_regs[c_arg]);
2101     }
2102   }
2103 
2104   int c_arg;
2105 
2106   // Pre-load a static method's oop into r14.  Used both by locking code and
2107   // the normal JNI call code.
2108   // point c_arg at the first arg that is already loaded in case we
2109   // need to spill before we call out
2110   c_arg = total_c_args - total_in_args;
2111 
2112   if (method->is_static()) {
2113 
2114     //  load oop into a register
2115     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2116 
    // Now handlize the static class mirror; it's known to be not-null.
2118     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2119     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2120 
2121     // Now get the handle
2122     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2123     // store the klass handle as second argument
2124     __ movptr(c_rarg1, oop_handle_reg);
2125     // and protect the arg if we must spill
2126     c_arg--;
2127   }
2128 
2129   // Change state to native (we save the return address in the thread, since it might not
2130   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2131   // points into the right code segment. It does not have to be the correct return pc.
2132   // We use the same pc/oopMap repeatedly when we call out
2133 
2134   Label native_return;
2135   if (LockingMode != LM_LEGACY && method->is_object_wait0()) {
2136     // For convenience we use the pc we want to resume to in case of preemption on Object.wait.
2137     __ set_last_Java_frame(rsp, noreg, native_return, rscratch1);
2138   } else {
2139     intptr_t the_pc = (intptr_t) __ pc();
2140     oop_maps->add_gc_map(the_pc - start, map);
2141 
2142     __ set_last_Java_frame(rsp, noreg, __ pc(), rscratch1);
2143   }
2144 
  // We have all of the arguments set up at this point. We must not touch any of the
  // argument registers from here on (if we were to save/restore them, there would be
  // no oopMap covering any oops they hold).
2147 
2148   if (DTraceMethodProbes) {
2149     // protect the args we've loaded
2150     save_args(masm, total_c_args, c_arg, out_regs);
2151     __ mov_metadata(c_rarg1, method());
2152     __ call_VM_leaf(
2153       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2154       r15_thread, c_rarg1);
2155     restore_args(masm, total_c_args, c_arg, out_regs);
2156   }
2157 
2158   // RedefineClasses() tracing support for obsolete method entry
2159   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2160     // protect the args we've loaded
2161     save_args(masm, total_c_args, c_arg, out_regs);
2162     __ mov_metadata(c_rarg1, method());
2163     __ call_VM_leaf(
2164       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2165       r15_thread, c_rarg1);
2166     restore_args(masm, total_c_args, c_arg, out_regs);
2167   }
2168 
2169   // Lock a synchronized method
2170 
2171   // Register definitions used by locking and unlocking
2172 
2173   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2174   const Register obj_reg  = rbx;  // Will contain the oop
2175   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2176   const Register old_hdr  = r13;  // value of old header at unlock time
2177 
2178   Label slow_path_lock;
2179   Label lock_done;
2180 
2181   if (method->is_synchronized()) {
2182     Label count_mon;
2183 
2184     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2185 
2186     // Get the handle (the 2nd argument)
2187     __ mov(oop_handle_reg, c_rarg1);
2188 
2189     // Get address of the box
2190 
2191     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2192 
2193     // Load the oop from the handle
2194     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2195 
2196     if (LockingMode == LM_MONITOR) {
2197       __ jmp(slow_path_lock);
2198     } else if (LockingMode == LM_LEGACY) {
2199       // Load immediate 1 into swap_reg %rax
2200       __ movl(swap_reg, 1);
2201 
2202       // Load (object->mark() | 1) into swap_reg %rax
2203       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2204 
2205       // Save (object->mark() | 1) into BasicLock's displaced header
2206       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2207 
2208       // src -> dest iff dest == rax else rax <- dest
2209       __ lock();
2210       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2211       __ jcc(Assembler::equal, count_mon);
2212 
2213       // Hmm should this move to the slow path code area???
2214 
2215       // Test if the oopMark is an obvious stack pointer, i.e.,
2216       //  1) (mark & 3) == 0, and
      //  2) rsp <= mark < rsp + os::pagesize()
2218       // These 3 tests can be done by evaluating the following
2219       // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2220       // assuming both stack pointer and pagesize have their
2221       // least significant 2 bits clear.
2222       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
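      // For illustration, assuming a 4K page: 3 - 4096 == -4093, i.e. a mask with
      // bits 0-1 and bits 12 and up set. (mark - rsp) & mask is zero exactly when
      // the low two bits of the mark are clear and 0 <= mark - rsp < 4096; if the
      // mark is below rsp the subtraction wraps and the high bits make the test fail.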
2223 
2224       __ subptr(swap_reg, rsp);
2225       __ andptr(swap_reg, 3 - (int)os::vm_page_size());
2226 
2227       // Save the test result, for recursive case, the result is zero
2228       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2229       __ jcc(Assembler::notEqual, slow_path_lock);
2230 
2231       __ bind(count_mon);
2232       __ inc_held_monitor_count();
2233     } else {
2234       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2235       __ lightweight_lock(lock_reg, obj_reg, swap_reg, rscratch1, slow_path_lock);
2236     }
2237 
2238     // Slow path will re-enter here
2239     __ bind(lock_done);
2240   }
2241 
2242   // Finally just about ready to make the JNI call
2243 
2244   // get JNIEnv* which is first argument to native
2245   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2246 
2247   // Now set thread in native
2248   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2249 
2250   __ call(RuntimeAddress(native_func));
2251 
2252   // Verify or restore cpu control state after JNI call
2253   __ restore_cpu_control_state_after_jni(rscratch1);
2254 
2255   // Unpack native results.
2256   switch (ret_type) {
2257   case T_BOOLEAN: __ c2bool(rax);            break;
2258   case T_CHAR   : __ movzwl(rax, rax);      break;
2259   case T_BYTE   : __ sign_extend_byte (rax); break;
2260   case T_SHORT  : __ sign_extend_short(rax); break;
2261   case T_INT    : /* nothing to do */        break;
2262   case T_DOUBLE :
2263   case T_FLOAT  :
    // Result is in xmm0; we'll save it as needed
2265     break;
2266   case T_ARRAY:                 // Really a handle
2267   case T_OBJECT:                // Really a handle
2268       break; // can't de-handlize until after safepoint check
2269   case T_VOID: break;
2270   case T_LONG: break;
2271   default       : ShouldNotReachHere();
2272   }
2273 
2274   // Switch thread to "native transition" state before reading the synchronization state.
2275   // This additional state is necessary because reading and testing the synchronization
2276   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2277   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2278   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2279   //     Thread A is resumed to finish this native method, but doesn't block here since it
  //     didn't see any synchronization in progress, and escapes.
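  // With the thread in _thread_in_native_trans (made visible by the fence below), the
  // safepoint/handshake code no longer treats this thread as safely in native; it
  // waits for the thread to block, which happens in
  // check_special_condition_for_native_trans() reached through the poll below.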
2281   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2282 
2283   // Force this write out before the read below
2284   if (!UseSystemMemoryBarrier) {
2285     __ membar(Assembler::Membar_mask_bits(
2286               Assembler::LoadLoad | Assembler::LoadStore |
2287               Assembler::StoreLoad | Assembler::StoreStore));
2288   }
2289 
2290   // check for safepoint operation in progress and/or pending suspend requests
2291   {
2292     Label Continue;
2293     Label slow_path;
2294 
2295     __ safepoint_poll(slow_path, true /* at_return */, false /* in_nmethod */);
2296 
2297     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2298     __ jcc(Assembler::equal, Continue);
2299     __ bind(slow_path);
2300 
2301     // Don't use call_VM as it will see a possible pending exception and forward it
    // and never return here, preventing us from clearing _last_native_pc down below.
    // We can't use call_VM_leaf either, as it will check to see if rsi & rdi are
2304     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2305     // by hand.
2306     //
2307     __ vzeroupper();
2308     save_native_result(masm, ret_type, stack_slots);
2309     __ mov(c_rarg0, r15_thread);
2310     __ mov(r12, rsp); // remember sp
2311     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2312     __ andptr(rsp, -16); // align stack as required by ABI
2313     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2314     __ mov(rsp, r12); // restore sp
2315     __ reinit_heapbase();
2316     // Restore any method result value
2317     restore_native_result(masm, ret_type, stack_slots);
2318     __ bind(Continue);
2319   }
2320 
2321   // change thread state
2322   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2323 
2324   if (LockingMode != LM_LEGACY && method->is_object_wait0()) {
2325     // Check preemption for Object.wait()
2326     __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset()));
2327     __ cmpptr(rscratch1, NULL_WORD);
2328     __ jccb(Assembler::equal, native_return);
2329     __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD);
2330     __ jmp(rscratch1);
2331     __ bind(native_return);
2332 
2333     intptr_t the_pc = (intptr_t) __ pc();
2334     oop_maps->add_gc_map(the_pc - start, map);
2335   }
2336 
2337 
2338   Label reguard;
2339   Label reguard_done;
2340   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2341   __ jcc(Assembler::equal, reguard);
2342   __ bind(reguard_done);
2343 
2344   // native result if any is live
2345 
2346   // Unlock
2347   Label slow_path_unlock;
2348   Label unlock_done;
2349   if (method->is_synchronized()) {
2350 
2351     Label fast_done;
2352 
2353     // Get locked oop from the handle we passed to jni
2354     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2355 
2356     if (LockingMode == LM_LEGACY) {
2357       Label not_recur;
2358       // Simple recursive lock?
2359       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD);
2360       __ jcc(Assembler::notEqual, not_recur);
2361       __ dec_held_monitor_count();
2362       __ jmpb(fast_done);
2363       __ bind(not_recur);
2364     }
2365 
2366     // Must save rax if it is live now because cmpxchg must use it
2367     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2368       save_native_result(masm, ret_type, stack_slots);
2369     }
2370 
2371     if (LockingMode == LM_MONITOR) {
2372       __ jmp(slow_path_unlock);
2373     } else if (LockingMode == LM_LEGACY) {
2374       // get address of the stack lock
2375       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2376       //  get old displaced header
2377       __ movptr(old_hdr, Address(rax, 0));
2378 
2379       // Atomic swap old header if oop still contains the stack lock
2380       __ lock();
2381       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2382       __ jcc(Assembler::notEqual, slow_path_unlock);
2383       __ dec_held_monitor_count();
2384     } else {
2385       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2386       __ lightweight_unlock(obj_reg, swap_reg, lock_reg, slow_path_unlock);
2387     }
2388 
2389     // slow path re-enters here
2390     __ bind(unlock_done);
2391     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2392       restore_native_result(masm, ret_type, stack_slots);
2393     }
2394 
2395     __ bind(fast_done);
2396   }
2397   if (DTraceMethodProbes) {
2398     save_native_result(masm, ret_type, stack_slots);
2399     __ mov_metadata(c_rarg1, method());
2400     __ call_VM_leaf(
2401          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2402          r15_thread, c_rarg1);
2403     restore_native_result(masm, ret_type, stack_slots);
2404   }
2405 
2406   __ reset_last_Java_frame(false);
2407 
2408   // Unbox oop result, e.g. JNIHandles::resolve value.
2409   if (is_reference_type(ret_type)) {
2410     __ resolve_jobject(rax /* value */,
2411                        rcx /* tmp */);
2412   }
2413 
2414   if (CheckJNICalls) {
2415     // clear_pending_jni_exception_check
2416     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2417   }
2418 
2419   // reset handle block
2420   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2421   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2422 
2423   // pop our frame
2424 
2425   __ leave();
2426 
2427   // Any exception pending?
2428   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2429   __ jcc(Assembler::notEqual, exception_pending);
2430 
2431   // Return
2432 
2433   __ ret(0);
2434 
2435   // Unexpected paths are out of line and go here
2436 
  __ bind(exception_pending);

  // forward the exception
  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2442 
2443   // Slow path locking & unlocking
2444   if (method->is_synchronized()) {
2445 
2446     // BEGIN Slow path lock
2447     __ bind(slow_path_lock);
2448 
2449     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2450     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2451 
2452     // protect the args we've loaded
2453     save_args(masm, total_c_args, c_arg, out_regs);
2454 
2455     __ mov(c_rarg0, obj_reg);
2456     __ mov(c_rarg1, lock_reg);
2457     __ mov(c_rarg2, r15_thread);
2458 
2459     // Not a leaf but we have last_Java_frame setup as we want.
2460     // We don't want to unmount in case of contention since that would complicate preserving
2461     // the arguments that had already been marshalled into the native convention. So we force
2462     // the freeze slow path to find this native wrapper frame (see recurse_freeze_native_frame())
2463     // and pin the vthread. Otherwise the fast path won't find it since we don't walk the stack.
2464     __ push_cont_fastpath();
2465     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2466     __ pop_cont_fastpath();
2467     restore_args(masm, total_c_args, c_arg, out_regs);
2468 
2469 #ifdef ASSERT
2470     { Label L;
2471     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2472     __ jcc(Assembler::equal, L);
2473     __ stop("no pending exception allowed on exit from monitorenter");
2474     __ bind(L);
2475     }
2476 #endif
2477     __ jmp(lock_done);
2478 
2479     // END Slow path lock
2480 
2481     // BEGIN Slow path unlock
2482     __ bind(slow_path_unlock);
2483 
2484     // If we haven't already saved the native result we must save it now as xmm registers
2485     // are still exposed.
2486     __ vzeroupper();
2487     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2488       save_native_result(masm, ret_type, stack_slots);
2489     }
2490 
2491     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2492 
2493     __ mov(c_rarg0, obj_reg);
2494     __ mov(c_rarg2, r15_thread);
2495     __ mov(r12, rsp); // remember sp
2496     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2497     __ andptr(rsp, -16); // align stack as required by ABI
2498 
2499     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2500     // NOTE that obj_reg == rbx currently
2501     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2502     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2503 
2504     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2505     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2506     __ mov(rsp, r12); // restore sp
2507     __ reinit_heapbase();
2508 #ifdef ASSERT
2509     {
2510       Label L;
2511       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2512       __ jcc(Assembler::equal, L);
2513       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2514       __ bind(L);
2515     }
2516 #endif /* ASSERT */
2517 
2518     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2519 
2520     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2521       restore_native_result(masm, ret_type, stack_slots);
2522     }
2523     __ jmp(unlock_done);
2524 
2525     // END Slow path unlock
2526 
2527   } // synchronized
2528 
2529   // SLOW PATH Reguard the stack if needed
2530 
2531   __ bind(reguard);
2532   __ vzeroupper();
2533   save_native_result(masm, ret_type, stack_slots);
2534   __ mov(r12, rsp); // remember sp
2535   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2536   __ andptr(rsp, -16); // align stack as required by ABI
2537   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2538   __ mov(rsp, r12); // restore sp
2539   __ reinit_heapbase();
2540   restore_native_result(masm, ret_type, stack_slots);
2541   // and continue
2542   __ jmp(reguard_done);
2543 
2544 
2545 
2546   __ flush();
2547 
2548   nmethod *nm = nmethod::new_native_nmethod(method,
2549                                             compile_id,
2550                                             masm->code(),
2551                                             vep_offset,
2552                                             frame_complete,
2553                                             stack_slots / VMRegImpl::slots_per_word,
2554                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2555                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2556                                             oop_maps);
2557 
2558   return nm;
2559 }
2560 
// This function returns the adjustment size (in number of words) to a c2i adapter
// activation for use during deoptimization.
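// For example, a callee with 3 parameters and 5 locals needs
// (5 - 3) * Interpreter::stackElementWords extra words.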
2563 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2564   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2565 }
2566 
2567 
2568 uint SharedRuntime::out_preserve_stack_slots() {
2569   return 0;
2570 }
2571 
2572 
2573 // Number of stack slots between incoming argument block and the start of
2574 // a new frame.  The PROLOG must add this many slots to the stack.  The
// EPILOG must remove this many slots.  amd64 needs two slots for the
// return address and two for the saved rbp.
2577 uint SharedRuntime::in_preserve_stack_slots() {
2578   return 4 + 2 * VerifyStackAtCalls;
2579 }
2580 
2581 VMReg SharedRuntime::thread_register() {
2582   return r15_thread->as_VMReg();
2583 }
2584 
2585 //------------------------------generate_deopt_blob----------------------------
2586 void SharedRuntime::generate_deopt_blob() {
2587   // Allocate space for the code
2588   ResourceMark rm;
2589   // Setup code generation tools
2590   int pad = 0;
2591   if (UseAVX > 2) {
2592     pad += 1024;
2593   }
2594   if (UseAPX) {
2595     pad += 1024;
2596   }
2597 #if INCLUDE_JVMCI
2598   if (EnableJVMCI) {
2599     pad += 512; // Increase the buffer size when compiling for JVMCI
2600   }
2601 #endif
2602   const char* name = SharedRuntime::stub_name(SharedStubId::deopt_id);
2603   CodeBuffer buffer(name, 2560+pad, 1024);
2604   MacroAssembler* masm = new MacroAssembler(&buffer);
2605   int frame_size_in_words;
2606   OopMap* map = nullptr;
2607   OopMapSet *oop_maps = new OopMapSet();
2608 
2609   // -------------
2610   // This code enters when returning to a de-optimized nmethod.  A return
2611   // address has been pushed on the stack, and return values are in
2612   // registers.
2613   // If we are doing a normal deopt then we were called from the patched
2614   // nmethod from the point we returned to the nmethod. So the return
  // address on the stack is wrong by NativeCall::instruction_size.
2616   // We will adjust the value so it looks like we have the original return
2617   // address on the stack (like when we eagerly deoptimized).
2618   // In the case of an exception pending when deoptimizing, we enter
2619   // with a return address on the stack that points after the call we patched
2620   // into the exception handler. We have the following register state from,
2621   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2622   //    rax: exception oop
2623   //    rbx: exception handler
2624   //    rdx: throwing pc
2625   // So in this case we simply jam rdx into the useless return address and
2626   // the stack looks just like we want.
2627   //
2628   // At this point we need to de-opt.  We save the argument return
2629   // registers.  We call the first C routine, fetch_unroll_info().  This
2630   // routine captures the return values and returns a structure which
2631   // describes the current frame size and the sizes of all replacement frames.
2632   // The current frame is compiled code and may contain many inlined
2633   // functions, each with their own JVM state.  We pop the current frame, then
2634   // push all the new frames.  Then we call the C routine unpack_frames() to
2635   // populate these frames.  Finally unpack_frames() returns us the new target
2636   // address.  Notice that callee-save registers are BLOWN here; they have
2637   // already been captured in the vframeArray at the time the return PC was
2638   // patched.
2639   address start = __ pc();
2640   Label cont;
2641 
2642   // Prolog for non exception case!
2643 
2644   // Save everything in sight.
2645   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2646 
2647   // Normal deoptimization.  Save exec mode for unpack_frames.
2648   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2649   __ jmp(cont);
2650 
2651   int reexecute_offset = __ pc() - start;
2652 #if INCLUDE_JVMCI && !defined(COMPILER1)
2653   if (UseJVMCICompiler) {
2654     // JVMCI does not use this kind of deoptimization
2655     __ should_not_reach_here();
2656   }
2657 #endif
2658 
2659   // Reexecute case
2660   // return address is the pc that describes what bci to re-execute at
2661 
2662   // No need to update map as each call to save_live_registers will produce an identical oopmap
2663   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2664 
2665   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2666   __ jmp(cont);
2667 
2668 #if INCLUDE_JVMCI
2669   Label after_fetch_unroll_info_call;
2670   int implicit_exception_uncommon_trap_offset = 0;
2671   int uncommon_trap_offset = 0;
2672 
2673   if (EnableJVMCI) {
2674     implicit_exception_uncommon_trap_offset = __ pc() - start;
2675 
2676     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2677     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2678 
2679     uncommon_trap_offset = __ pc() - start;
2680 
2681     // Save everything in sight.
2682     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2683     // fetch_unroll_info needs to call last_java_frame()
2684     __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2685 
2686     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2687     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2688 
2689     __ movl(r14, Deoptimization::Unpack_reexecute);
2690     __ mov(c_rarg0, r15_thread);
2691     __ movl(c_rarg2, r14); // exec mode
2692     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2693     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2694 
2695     __ reset_last_Java_frame(false);
2696 
2697     __ jmp(after_fetch_unroll_info_call);
2698   } // EnableJVMCI
2699 #endif // INCLUDE_JVMCI
2700 
2701   int exception_offset = __ pc() - start;
2702 
2703   // Prolog for exception case
2704 
2705   // All registers are dead at this entry point, except for rax and
2706   // rdx, which contain the exception oop and exception pc
2707   // respectively.  Set them in TLS and fall through to the
2708   // unpack_with_exception_in_tls entry point.
2709 
2710   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2711   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2712 
2713   int exception_in_tls_offset = __ pc() - start;
2714 
2715   // new implementation because exception oop is now passed in JavaThread
2716 
2717   // Prolog for exception case
2718   // All registers must be preserved because they might be used by LinearScan
2719   // Exception oop and throwing PC are passed in JavaThread
2720   // tos: stack at point of call to method that threw the exception (i.e. only
2721   // args are on the stack, no return address)
2722 
2723   // make room on stack for the return address
2724   // It will be patched later with the throwing pc. The correct value is not
2725   // available now because loading it from memory would destroy registers.
2726   __ push(0);
2727 
2728   // Save everything in sight.
2729   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2730 
2731   // Now it is safe to overwrite any register
2732 
2733   // Deopt during an exception.  Save exec mode for unpack_frames.
2734   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2735 
2736   // load throwing pc from JavaThread and patch it as the return address
2737   // of the current frame. Then clear the field in JavaThread
2738 
2739   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2740   __ movptr(Address(rbp, wordSize), rdx);
2741   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2742 
2743 #ifdef ASSERT
2744   // verify that there is really an exception oop in JavaThread
2745   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2746   __ verify_oop(rax);
2747 
2748   // verify that there is no pending exception
2749   Label no_pending_exception;
2750   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2751   __ testptr(rax, rax);
2752   __ jcc(Assembler::zero, no_pending_exception);
2753   __ stop("must not have pending exception here");
2754   __ bind(no_pending_exception);
2755 #endif
2756 
2757   __ bind(cont);
2758 
2759   // Call C code.  Need thread and this frame, but NOT official VM entry
2760   // crud.  We cannot block on this call, no GC can happen.
2761   //
2762   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2763 
2764   // fetch_unroll_info needs to call last_java_frame().
2765 
2766   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2767 #ifdef ASSERT
2768   { Label L;
2769     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2770     __ jcc(Assembler::equal, L);
2771     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2772     __ bind(L);
2773   }
2774 #endif // ASSERT
2775   __ mov(c_rarg0, r15_thread);
2776   __ movl(c_rarg1, r14); // exec_mode
2777   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2778 
2779   // Need to have an oopmap that tells fetch_unroll_info where to
2780   // find any register it might need.
2781   oop_maps->add_gc_map(__ pc() - start, map);
2782 
2783   __ reset_last_Java_frame(false);
2784 
2785 #if INCLUDE_JVMCI
2786   if (EnableJVMCI) {
2787     __ bind(after_fetch_unroll_info_call);
2788   }
2789 #endif
2790 
2791   // Load UnrollBlock* into rdi
2792   __ mov(rdi, rax);
2793 
2794   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
2795   Label noException;
2796   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2797   __ jcc(Assembler::notEqual, noException);
2798   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2799   // QQQ this is useless; it was nulled above
2800   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2801   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2802   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2803 
2804   __ verify_oop(rax);
2805 
2806   // Overwrite the result registers with the exception results.
2807   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2808   // I think this is useless
2809   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2810 
2811   __ bind(noException);
2812 
2813   // Only register save data is on the stack.
2814   // Now restore the result registers.  Everything else is either dead
2815   // or captured in the vframeArray.
2816   RegisterSaver::restore_result_registers(masm);
2817 
2818   // All of the register save area has been popped off the stack. Only the
2819   // return address remains.
2820 
2821   // Pop all the frames we must move/replace.
2822   //
2823   // Frame picture (youngest to oldest)
2824   // 1: self-frame (no frame link)
2825   // 2: deopting frame  (no frame link)
2826   // 3: caller of deopting frame (could be compiled/interpreted).
2827   //
2828   // Note: by leaving the return address of self-frame on the stack
2829   // and using the size of frame 2 to adjust the stack
2830   // when we are done the return to frame 3 will still be on the stack.
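       //
       // Illustrative picture only (a sketch, not to scale):
       //
       //   before: | caller (3) | deopting frame (2) | self-frame (1) | <- rsp
       //   after:  | caller (3) | skeletal interpreter frames ...     | self-frame | <- rsp
       //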
2831 
2832   // Pop deoptimized frame
2833   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
2834   __ addptr(rsp, rcx);
2835 
2836   // rsp should be pointing at the return address to the caller (3)
2837 
2838   // Pick up the initial fp we should save
2839   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2840   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2841 
2842 #ifdef ASSERT
2843   // Compilers generate code that bangs the stack by as much as the
2844   // interpreter would need. So this stack banging should never
2845   // trigger a fault. Verify that it does not on non-product builds.
2846   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2847   __ bang_stack_size(rbx, rcx);
2848 #endif
2849 
2850   // Load address of array of frame pcs into rcx
2851   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2852 
2853   // Trash the old pc
2854   __ addptr(rsp, wordSize);
2855 
2856   // Load address of array of frame sizes into rsi
2857   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2858 
2859   // Load counter into rdx
2860   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
2861 
2862   // Now adjust the caller's stack to make up for the extra locals, but
2863   // record the original sp first so that we can save it in the skeletal
2864   // interpreter frame; the stack walking of interpreter_sender will then
2865   // get the unextended sp value and not the "real" sp value.
2866 
2867   const Register sender_sp = r8;
2868 
2869   __ mov(sender_sp, rsp);
2870   __ movl(rbx, Address(rdi,
2871                        Deoptimization::UnrollBlock::
2872                        caller_adjustment_offset()));
2873   __ subptr(rsp, rbx);
2874 
2875   // Push interpreter frames in a loop
2876   Label loop;
2877   __ bind(loop);
2878   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2879   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
2880   __ pushptr(Address(rcx, 0));          // Save return address
2881   __ enter();                           // Save old & set new ebp
2882   __ subptr(rsp, rbx);                  // Prolog
2883   // This value is corrected by layout_activation_impl
2884   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
2885   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2886   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2887   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2888   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2889   __ decrementl(rdx);                   // Decrement counter
2890   __ jcc(Assembler::notZero, loop);
2891   __ pushptr(Address(rcx, 0));          // Save final return address
2892 
2893   // Re-push self-frame
2894   __ enter();                           // Save old & set new ebp
2895 
2896   // Allocate a full sized register save area.
2897   // Return address and rbp are in place, so we allocate two fewer words.
2898   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2899 
2900   // Restore frame locals after moving the frame
2901   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2902   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2903 
2904   // Call C code.  Need thread but NOT official VM entry
2905   // crud.  We cannot block on this call, no GC can happen.  Call should
2906   // restore return values to their stack-slots with the new SP.
2907   //
2908   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2909 
2910   // Use rbp because the frames look interpreted now
2911   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2912   // Don't need the precise return PC here, just precise enough to point into this code blob.
2913   address the_pc = __ pc();
2914   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
2915 
2916   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2917   __ mov(c_rarg0, r15_thread);
2918   __ movl(c_rarg1, r14); // second arg: exec_mode
2919   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2920   // Revert SP alignment after call since we're going to do some SP relative addressing below
2921   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2922 
2923   // Set an oopmap for the call site
2924   // Use the same PC we used for the last java frame
2925   oop_maps->add_gc_map(the_pc - start,
2926                        new OopMap( frame_size_in_words, 0 ));
2927 
2928   // Clear fp AND pc
2929   __ reset_last_Java_frame(true);
2930 
2931   // Collect return values
2932   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2933   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2934   // I think this is useless (throwing pc?)
2935   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2936 
2937   // Pop self-frame.
2938   __ leave();                           // Epilog
2939 
2940   // Jump to interpreter
2941   __ ret(0);
2942 
2943   // Make sure all code is generated
2944   masm->flush();
2945 
2946   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2947   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2948 #if INCLUDE_JVMCI
2949   if (EnableJVMCI) {
2950     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2951     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2952   }
2953 #endif
2954 }
2955 
2956 //------------------------------generate_handler_blob------
2957 //
2958 // Generate a special Compile2Runtime blob that saves all registers,
2959 // and sets up an oopmap.
2960 //
2961 SafepointBlob* SharedRuntime::generate_handler_blob(SharedStubId id, address call_ptr) {
2962   assert(StubRoutines::forward_exception_entry() != nullptr,
2963          "must be generated before");
2964   assert(is_polling_page_id(id), "expected a polling page stub id");
2965 
2966   ResourceMark rm;
2967   OopMapSet *oop_maps = new OopMapSet();
2968   OopMap* map;
2969 
2970   // Allocate space for the code.  Setup code generation tools.
2971   const char* name = SharedRuntime::stub_name(id);
2972   CodeBuffer buffer(name, 2548, 1024);
2973   MacroAssembler* masm = new MacroAssembler(&buffer);
2974 
2975   address start   = __ pc();
2976   address call_pc = nullptr;
2977   int frame_size_in_words;
2978   bool cause_return = (id == SharedStubId::polling_page_return_handler_id);
2979   bool save_wide_vectors = (id == SharedStubId::polling_page_vectors_safepoint_handler_id);
2980 
2981   // Make room for return address (or push it again)
2982   if (!cause_return) {
2983     __ push(rbx);
2984   }
2985 
2986   // Save registers, fpu state, and flags
2987   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
2988 
2989   // The following is basically a call_VM.  However, we need the precise
2990   // address of the call in order to generate an oopmap. Hence, we do all the
2991   // work ourselves.
2992 
2993   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
2994 
2995   // The return address must always be correct so that frame constructor never
2996   // sees an invalid pc.
2997 
2998   if (!cause_return) {
2999     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3000     // Additionally, rbx is a callee saved register and we can look at it later to determine
3001     // if someone changed the return address for us!
3002     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3003     __ movptr(Address(rbp, wordSize), rbx);
3004   }
3005 
3006   // Do the call
3007   __ mov(c_rarg0, r15_thread);
3008   __ call(RuntimeAddress(call_ptr));
3009 
3010   // Set an oopmap for the call site.  This oopmap will map all
3011   // oop-registers and debug-info registers as callee-saved.  This
3012   // will allow deoptimization at this safepoint to find all possible
3013   // debug-info recordings, as well as let GC find all oops.
3014 
3015   oop_maps->add_gc_map( __ pc() - start, map);
3016 
3017   Label noException;
3018 
3019   __ reset_last_Java_frame(false);
3020 
3021   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3022   __ jcc(Assembler::equal, noException);
3023 
3024   // Exception pending
3025 
3026   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3027 
3028   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3029 
3030   // No exception case
3031   __ bind(noException);
3032 
3033   Label no_adjust;
3034 #ifdef ASSERT
3035   Label bail;
3036 #endif
3037   if (!cause_return) {
3038     Label no_prefix, not_special, check_rex_prefix;
3039 
3040     // If our stashed return pc was modified by the runtime we avoid touching it
3041     __ cmpptr(rbx, Address(rbp, wordSize));
3042     __ jcc(Assembler::notEqual, no_adjust);
3043 
3044     // Skip over the poll instruction.
3045     // See NativeInstruction::is_safepoint_poll()
3046     // Possible encodings:
3047     //      85 00       test   %eax,(%rax)
3048     //      85 01       test   %eax,(%rcx)
3049     //      85 02       test   %eax,(%rdx)
3050     //      85 03       test   %eax,(%rbx)
3051     //      85 06       test   %eax,(%rsi)
3052     //      85 07       test   %eax,(%rdi)
3053     //
3054     //   41 85 00       test   %eax,(%r8)
3055     //   41 85 01       test   %eax,(%r9)
3056     //   41 85 02       test   %eax,(%r10)
3057     //   41 85 03       test   %eax,(%r11)
3058     //   41 85 06       test   %eax,(%r14)
3059     //   41 85 07       test   %eax,(%r15)
3060     //
3061     //      85 04 24    test   %eax,(%rsp)
3062     //   41 85 04 24    test   %eax,(%r12)
3063     //      85 45 00    test   %eax,0x0(%rbp)
3064     //   41 85 45 00    test   %eax,0x0(%r13)
3065     //
3066     // Notes:
3067     //  Format of the legacy MAP0 test instruction:
3068     //  [REX/REX2] [OPCODE] [ModRM] [SIB] [DISP] [IMM32]
3069     //  o  For the safepoint polling instruction "test %eax,(%rax)", the encodings of the first
3070     //     register operand and of the base register of the memory operand are in [0-8), so no
3071     //     additional REX prefix (whose REX.B bit holds the MSB of the register encoding) is
3072     //     needed and a two-byte encoding is sufficient.
3073     //  o  For a safepoint polling instruction like "test %eax,(%r8)", the encoding of the BASE
3074     //     register of the memory operand is 1000, so an additional REX prefix is required,
3075     //     adding one byte to the instruction encoding.
3076     //  o  If the BASE register is one of the 32 extended GPRs available only on targets
3077     //     supporting the Intel APX extension, a two-byte REX2 prefix must be emitted to hold
3078     //     the most significant two bits of the 5-bit register encoding.
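         //
         // In effect the code below computes (an illustrative sketch, not emitted
         // code; the real work adjusts rbx in place):
         //
         //   int skip = 2;                                       // opcode (0x85) + ModRM
         //   if (first byte is a REX2 prefix)         skip += 2; // APX extended-GPR base
         //   else if (first byte is a REX.B prefix)   skip += 1; // r8-r15 base
         //   if (ModRM base encoding is 0x04 or 0x05) skip += 1; // rsp/rbp/r12/r13 need SIB or disp8
         //   return_pc = poll_pc + skip;
         //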
3079 
3080     if (VM_Version::supports_apx_f()) {
3081       __ cmpb(Address(rbx, 0), Assembler::REX2);
3082       __ jccb(Assembler::notEqual, check_rex_prefix);
3083       __ addptr(rbx, 2);
3084       __ bind(check_rex_prefix);
3085     }
3086     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3087     __ jccb(Assembler::notEqual, no_prefix);
3088     __ addptr(rbx, 1);
3089     __ bind(no_prefix);
3090 #ifdef ASSERT
3091     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3092 #endif
3093     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3094     // r12/rsp 0x04
3095     // r13/rbp 0x05
3096     __ movzbq(rcx, Address(rbx, 1));
3097     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3098     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3099     __ cmpptr(rcx, 1);
3100     __ jccb(Assembler::above, not_special);
3101     __ addptr(rbx, 1);
3102     __ bind(not_special);
3103 #ifdef ASSERT
3104     // Verify the correct encoding of the poll we're about to skip.
3105     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3106     __ jcc(Assembler::notEqual, bail);
3107     // Mask out the modrm bits
3108     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3109     // rax encodes to 0, so if the bits are nonzero it's incorrect
3110     __ jcc(Assembler::notZero, bail);
3111 #endif
3112     // Adjust return pc forward to step over the safepoint poll instruction
3113     __ addptr(rbx, 2);
3114     __ movptr(Address(rbp, wordSize), rbx);
3115   }
3116 
3117   __ bind(no_adjust);
3118   // Normal exit, restore registers and exit.
3119   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3120   __ ret(0);
3121 
3122 #ifdef ASSERT
3123   __ bind(bail);
3124   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3125 #endif
3126 
3127   // Make sure all code is generated
3128   masm->flush();
3129 
3130   // Fill-out other meta info
3131   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3132 }
3133 
3134 //
3135 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3136 //
3137 // Generate a stub that calls into vm to find out the proper destination
3138 // of a java call. All the argument registers are live at this point
3139 // but since this is generic code we don't know what they are and the caller
3140 // must do any gc of the args.
3141 //
3142 RuntimeStub* SharedRuntime::generate_resolve_blob(SharedStubId id, address destination) {
3143   assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3144   assert(is_resolve_id(id), "expected a resolve stub id");
3145 
3146   // allocate space for the code
3147   ResourceMark rm;
3148 
3149   const char* name = SharedRuntime::stub_name(id);
3150   CodeBuffer buffer(name, 1552, 512);
3151   MacroAssembler* masm = new MacroAssembler(&buffer);
3152 
3153   int frame_size_in_words;
3154 
3155   OopMapSet *oop_maps = new OopMapSet();
3156   OopMap* map = nullptr;
3157 
3158   int start = __ offset();
3159 
3160   // No need to save vector registers since they are caller-saved anyway.
3161   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3162 
3163   int frame_complete = __ offset();
3164 
3165   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3166 
3167   __ mov(c_rarg0, r15_thread);
3168 
3169   __ call(RuntimeAddress(destination));
3170 
3171 
3172   // Set an oopmap for the call site.
3173   // We need this not only for callee-saved registers, but also for volatile
3174   // registers that the compiler might be keeping live across a safepoint.
3175 
3176   oop_maps->add_gc_map( __ offset() - start, map);
3177 
3178   // rax contains the address we are going to jump to assuming no exception got installed
3179 
3180   // clear last_Java_sp
3181   __ reset_last_Java_frame(false);
3182   // check for pending exceptions
3183   Label pending;
3184   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3185   __ jcc(Assembler::notEqual, pending);
3186 
3187   // get the returned Method*
3188   __ get_vm_result_2(rbx);
3189   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3190 
3191   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3192 
3193   RegisterSaver::restore_live_registers(masm);
3194 
3195   // We are back to the original state on entry and ready to go.
3196 
3197   __ jmp(rax);
3198 
3199   // Pending exception after the safepoint
3200 
3201   __ bind(pending);
3202 
3203   RegisterSaver::restore_live_registers(masm);
3204 
3205   // exception pending => remove activation and forward to exception handler
3206 
3207   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
3208 
3209   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3210   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3211 
3212   // -------------
3213   // make sure all code is generated
3214   masm->flush();
3215 
3216   // return the blob
3217   // frame_size_in_words is in words, which is what new_runtime_stub expects
3218   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3219 }
3220 
3221 // Continuation point for throwing of implicit exceptions that are
3222 // not handled in the current activation. Fabricates an exception
3223 // oop and initiates normal exception dispatching in this
3224 // frame. Since we need to preserve callee-saved values (currently
3225 // only for C2, but done for C1 as well) we need a callee-saved oop
3226 // map and therefore have to make these stubs into RuntimeStubs
3227 // rather than BufferBlobs.  If the compiler needs all registers to
3228 // be preserved between the fault point and the exception handler
3229 // then it must assume responsibility for that in
3230 // AbstractCompiler::continuation_for_implicit_null_exception or
3231 // continuation_for_implicit_division_by_zero_exception. All other
3232 // implicit exceptions (e.g., NullPointerException or
3233 // AbstractMethodError on entry) are either at call sites or
3234 // otherwise assume that stack unwinding will be initiated, so
3235 // caller saved registers were assumed volatile in the compiler.
3236 RuntimeStub* SharedRuntime::generate_throw_exception(SharedStubId id, address runtime_entry) {
3237   assert(is_throw_id(id), "expected a throw stub id");
3238 
3239   const char* name = SharedRuntime::stub_name(id);
3240 
3241   // Information about frame layout at time of blocking runtime call.
3242   // Note that we only have to preserve callee-saved registers since
3243   // the compilers are responsible for supplying a continuation point
3244   // if they expect all registers to be preserved.
3245   enum layout {
3246     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
3247     rbp_off2,
3248     return_off,
3249     return_off2,
3250     framesize // inclusive of return address
3251   };
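       // For example, assuming frame::arg_reg_save_area_bytes is 0 (as on System V
       // targets; on Windows it instead reserves home space for the first argument
       // registers), framesize is 4 slots = 2 words: the saved rbp plus the return
       // address.  The frame size handed to new_runtime_stub below is framesize >> 1
       // words.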
3252 
3253   int insts_size = 512;
3254   int locs_size  = 64;
3255 
3256   ResourceMark rm;
3257   const char* timer_msg = "SharedRuntime generate_throw_exception";
3258   TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime));
3259 
3260   CodeBuffer code(name, insts_size, locs_size);
3261   OopMapSet* oop_maps  = new OopMapSet();
3262   MacroAssembler* masm = new MacroAssembler(&code);
3263 
3264   address start = __ pc();
3265 
3266   // This is an inlined and slightly modified version of call_VM
3267   // which has the ability to fetch the return PC out of
3268   // thread-local storage and also sets up last_Java_sp slightly
3269   // differently than the real call_VM
3270 
3271   __ enter(); // required for proper stackwalking of RuntimeStub frame
3272 
3273   assert(is_even(framesize/2), "sp not 16-byte aligned");
3274 
3275   // return address and rbp are already in place
3276   __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
3277 
3278   int frame_complete = __ pc() - start;
3279 
3280   // Set up last_Java_sp and last_Java_fp
3281   address the_pc = __ pc();
3282   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3283   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3284 
3285   // Call runtime
3286   __ movptr(c_rarg0, r15_thread);
3287   BLOCK_COMMENT("call runtime_entry");
3288   __ call(RuntimeAddress(runtime_entry));
3289 
3290   // Generate oop map
3291   OopMap* map = new OopMap(framesize, 0);
3292 
3293   oop_maps->add_gc_map(the_pc - start, map);
3294 
3295   __ reset_last_Java_frame(true);
3296 
3297   __ leave(); // required for proper stackwalking of RuntimeStub frame
3298 
3299   // check for pending exceptions
3300 #ifdef ASSERT
3301   Label L;
3302   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3303   __ jcc(Assembler::notEqual, L);
3304   __ should_not_reach_here();
3305   __ bind(L);
3306 #endif // ASSERT
3307   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3308 
3309 
3310   // codeBlob framesize is in words (not VMRegImpl::slot_size)
3311   RuntimeStub* stub =
3312     RuntimeStub::new_runtime_stub(name,
3313                                   &code,
3314                                   frame_complete,
3315                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3316                                   oop_maps, false);
3317   return stub;
3318 }
3319 
3320 //------------------------------Montgomery multiplication------------------------
3321 //
3322 
3323 #ifndef _WINDOWS
3324 
3325 // Subtract 0:b from carry:a.  Return carry.
3326 static julong
3327 sub(julong a[], julong b[], julong carry, long len) {
3328   long long i = 0, cnt = len;
3329   julong tmp;
3330   asm volatile("clc; "
3331                "0: ; "
3332                "mov (%[b], %[i], 8), %[tmp]; "
3333                "sbb %[tmp], (%[a], %[i], 8); "
3334                "inc %[i]; dec %[cnt]; "
3335                "jne 0b; "
3336                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3337                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3338                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3339                : "memory");
3340   return tmp;
3341 }
3342 
3343 // Multiply (unsigned) Long A by Long B, accumulating the double-
3344 // length result into the accumulator formed of T0, T1, and T2.
3345 #define MACC(A, B, T0, T1, T2)                                  \
3346 do {                                                            \
3347   unsigned long hi, lo;                                         \
3348   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3349            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3350            : "r"(A), "a"(B) : "cc");                            \
3351  } while(0)
3352 
3353 // As above, but add twice the double-length result into the
3354 // accumulator.
3355 #define MACC2(A, B, T0, T1, T2)                                 \
3356 do {                                                            \
3357   unsigned long hi, lo;                                         \
3358   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3359            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3360            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3361            : "r"(A), "a"(B) : "cc");                            \
3362  } while(0)
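
     // For illustration only (not used by this code, and assuming a compiler that
     // provides unsigned __int128): MACC is equivalent to the 192-bit accumulation
     // sketched below, and MACC2 simply applies it twice.
     //
     //   static inline void macc_sketch(julong a, julong b,
     //                                  julong& t0, julong& t1, julong& t2) {
     //     unsigned __int128 p  = (unsigned __int128)a * b;
     //     unsigned __int128 lo = (unsigned __int128)t0 + (julong)p;
     //     t0 = (julong)lo;
     //     unsigned __int128 hi = (unsigned __int128)t1 + (julong)(p >> 64) + (julong)(lo >> 64);
     //     t1 = (julong)hi;
     //     t2 += (julong)(hi >> 64);
     //   }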
3363 
3364 #else //_WINDOWS
3365 
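     // Subtract 0:b from carry:a and return the resulting carry word; same contract
     // as the inline-asm version above, but built on _addcarry_u64: a - b is computed
     // as a + ~b + 1, so a carry-in of 1 means "no borrow yet".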
3366 static julong
3367 sub(julong a[], julong b[], julong carry, long len) {
3368   long i;
3369   julong tmp;
3370   unsigned char c = 1;
3371   for (i = 0; i < len; i++) {
3372     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3373     a[i] = tmp;
3374   }
3375   c = _addcarry_u64(c, carry, ~0, &tmp);
3376   return tmp;
3377 }
3378 
3379 // Multiply (unsigned) Long A by Long B, accumulating the double-
3380 // length result into the accumulator formed of T0, T1, and T2.
3381 #define MACC(A, B, T0, T1, T2)                          \
3382 do {                                                    \
3383   julong hi, lo;                            \
3384   lo = _umul128(A, B, &hi);                             \
3385   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3386   c = _addcarry_u64(c, hi, T1, &T1);                    \
3387   _addcarry_u64(c, T2, 0, &T2);                         \
3388  } while(0)
3389 
3390 // As above, but add twice the double-length result into the
3391 // accumulator.
3392 #define MACC2(A, B, T0, T1, T2)                         \
3393 do {                                                    \
3394   julong hi, lo;                            \
3395   lo = _umul128(A, B, &hi);                             \
3396   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3397   c = _addcarry_u64(c, hi, T1, &T1);                    \
3398   _addcarry_u64(c, T2, 0, &T2);                         \
3399   c = _addcarry_u64(0, lo, T0, &T0);                    \
3400   c = _addcarry_u64(c, hi, T1, &T1);                    \
3401   _addcarry_u64(c, T2, 0, &T2);                         \
3402  } while(0)
3403 
3404 #endif //_WINDOWS
3405 
3406 // Fast Montgomery multiplication.  The derivation of the algorithm is
3407 // in  A Cryptographic Library for the Motorola DSP56000,
3408 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
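     //
     // In outline (an informal sketch, not a substitute for the reference above):
     // the operands are little-endian arrays of 64-bit words and inv satisfies
     // inv * n[0] == -1 (mod 2^64).  For column i the loop accumulates
     //
     //   t += sum_j a[j]*b[i-j] + sum_{j<i} m[j]*n[i-j]
     //
     // into the triple-word accumulator t2:t1:t0, then chooses
     //
     //   m[i] = (t mod 2^64) * inv (mod 2^64)
     //
     // so that adding m[i]*n[0] clears the low word of t (hence the assert on t0),
     // and finally shifts t right by one word.  After all 2*len columns, m holds
     // a * b * 2^-(64*len) reduced modulo n, up to the carry folding performed by
     // the trailing sub() calls.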
3409 
3410 static void NOINLINE
3411 montgomery_multiply(julong a[], julong b[], julong n[],
3412                     julong m[], julong inv, int len) {
3413   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3414   int i;
3415 
3416   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3417 
3418   for (i = 0; i < len; i++) {
3419     int j;
3420     for (j = 0; j < i; j++) {
3421       MACC(a[j], b[i-j], t0, t1, t2);
3422       MACC(m[j], n[i-j], t0, t1, t2);
3423     }
3424     MACC(a[i], b[0], t0, t1, t2);
3425     m[i] = t0 * inv;
3426     MACC(m[i], n[0], t0, t1, t2);
3427 
3428     assert(t0 == 0, "broken Montgomery multiply");
3429 
3430     t0 = t1; t1 = t2; t2 = 0;
3431   }
3432 
3433   for (i = len; i < 2*len; i++) {
3434     int j;
3435     for (j = i-len+1; j < len; j++) {
3436       MACC(a[j], b[i-j], t0, t1, t2);
3437       MACC(m[j], n[i-j], t0, t1, t2);
3438     }
3439     m[i-len] = t0;
3440     t0 = t1; t1 = t2; t2 = 0;
3441   }
3442 
3443   while (t0)
3444     t0 = sub(m, n, t0, len);
3445 }
3446 
3447 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3448 // multiplies so it should be up to 25% faster than Montgomery
3449 // multiplication.  However, its loop control is more complex and it
3450 // may actually run slower on some machines.
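     //
     // The saving comes from symmetry: a[j]*a[i-j] == a[i-j]*a[j], so each cross
     // product is computed once and doubled with MACC2.  Roughly, the a*a half of
     // the work drops from len^2 to len^2/2 multiplies while the m*n reduction half
     // stays at len^2, i.e. about (len^2/2 + len^2) / (2*len^2) = 75% of the
     // multiplies of montgomery_multiply.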
3451 
3452 static void NOINLINE
3453 montgomery_square(julong a[], julong n[],
3454                   julong m[], julong inv, int len) {
3455   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3456   int i;
3457 
3458   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3459 
3460   for (i = 0; i < len; i++) {
3461     int j;
3462     int end = (i+1)/2;
3463     for (j = 0; j < end; j++) {
3464       MACC2(a[j], a[i-j], t0, t1, t2);
3465       MACC(m[j], n[i-j], t0, t1, t2);
3466     }
3467     if ((i & 1) == 0) {
3468       MACC(a[j], a[j], t0, t1, t2);
3469     }
3470     for (; j < i; j++) {
3471       MACC(m[j], n[i-j], t0, t1, t2);
3472     }
3473     m[i] = t0 * inv;
3474     MACC(m[i], n[0], t0, t1, t2);
3475 
3476     assert(t0 == 0, "broken Montgomery square");
3477 
3478     t0 = t1; t1 = t2; t2 = 0;
3479   }
3480 
3481   for (i = len; i < 2*len; i++) {
3482     int start = i-len+1;
3483     int end = start + (len - start)/2;
3484     int j;
3485     for (j = start; j < end; j++) {
3486       MACC2(a[j], a[i-j], t0, t1, t2);
3487       MACC(m[j], n[i-j], t0, t1, t2);
3488     }
3489     if ((i & 1) == 0) {
3490       MACC(a[j], a[j], t0, t1, t2);
3491     }
3492     for (; j < len; j++) {
3493       MACC(m[j], n[i-j], t0, t1, t2);
3494     }
3495     m[i-len] = t0;
3496     t0 = t1; t1 = t2; t2 = 0;
3497   }
3498 
3499   while (t0)
3500     t0 = sub(m, n, t0, len);
3501 }
3502 
3503 // Swap words in a longword.
3504 static julong swap(julong x) {
3505   return (x << 32) | (x >> 32);
3506 }
3507 
3508 // Copy len longwords from s to d, word-swapping as we go.  The
3509 // destination array is reversed.
3510 static void reverse_words(julong *s, julong *d, int len) {
3511   d += len;
3512   while(len-- > 0) {
3513     d--;
3514     *d = swap(*s);
3515     s++;
3516   }
3517 }
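
     // For example (illustrative values, assuming a little-endian host, which x86_64
     // is): a BigInteger magnitude is stored most-significant jint first, so the jint
     // array {0x00000001, 0x00000002, 0x00000003, 0x00000004} reads as the julongs
     // {0x0000000200000001, 0x0000000400000003}; reverse_words turns this into
     // {0x0000000300000004, 0x0000000100000002}, the same value laid out least-
     // significant 64-bit word first, which is what the Montgomery code expects.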
3518 
3519 // The threshold at which squaring is advantageous was determined
3520 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3521 #define MONTGOMERY_SQUARING_THRESHOLD 64
3522 
3523 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3524                                         jint len, jlong inv,
3525                                         jint *m_ints) {
3526   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3527   int longwords = len/2;
3528 
3529   // Make very sure we don't use so much space that the stack might
3530   // overflow.  512 jints corresponds to a 16384-bit integer and
3531   // will use a total of 8k bytes of stack space here.
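       // Concretely: divisor is 8 bytes * 4 arrays = 32, so the guarantee allows
       // longwords <= 8192 / 32 = 256 (i.e. len <= 512 jints), and the alloca below
       // requests at most 256 * 8 * 4 = 8192 bytes.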
3532   int divisor = sizeof(julong) * 4;
3533   guarantee(longwords <= 8192 / divisor, "must be");
3534   int total_allocation = longwords * sizeof (julong) * 4;
3535   julong *scratch = (julong *)alloca(total_allocation);
3536 
3537   // Local scratch arrays
3538   julong
3539     *a = scratch + 0 * longwords,
3540     *b = scratch + 1 * longwords,
3541     *n = scratch + 2 * longwords,
3542     *m = scratch + 3 * longwords;
3543 
3544   reverse_words((julong *)a_ints, a, longwords);
3545   reverse_words((julong *)b_ints, b, longwords);
3546   reverse_words((julong *)n_ints, n, longwords);
3547 
3548   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3549 
3550   reverse_words(m, (julong *)m_ints, longwords);
3551 }
3552 
3553 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3554                                       jint len, jlong inv,
3555                                       jint *m_ints) {
3556   assert(len % 2 == 0, "array length in montgomery_square must be even");
3557   int longwords = len/2;
3558 
3559   // Make very sure we don't use so much space that the stack might
3560   // overflow.  512 jints corresponds to a 16384-bit integer and
3561   // will use a total of 6k bytes of stack space here.
3562   int divisor = sizeof(julong) * 3;
3563   guarantee(longwords <= (8192 / divisor), "must be");
3564   int total_allocation = longwords * sizeof (julong) * 3;
3565   julong *scratch = (julong *)alloca(total_allocation);
3566 
3567   // Local scratch arrays
3568   julong
3569     *a = scratch + 0 * longwords,
3570     *n = scratch + 1 * longwords,
3571     *m = scratch + 2 * longwords;
3572 
3573   reverse_words((julong *)a_ints, a, longwords);
3574   reverse_words((julong *)n_ints, n, longwords);
3575 
3576   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3577     ::montgomery_square(a, n, m, (julong)inv, longwords);
3578   } else {
3579     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3580   }
3581 
3582   reverse_words(m, (julong *)m_ints, longwords);
3583 }
3584 
3585 #if INCLUDE_JFR
3586 
3587 // For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
3588 // It returns a jobject handle to the event writer.
3589 // The handle is dereferenced and the return value is the event writer oop.
3590 RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() {
3591   enum layout {
3592     rbp_off,
3593     rbpH_off,
3594     return_off,
3595     return_off2,
3596     framesize // inclusive of return address
3597   };
3598 
3599   const char* name = SharedRuntime::stub_name(SharedStubId::jfr_write_checkpoint_id);
3600   CodeBuffer code(name, 1024, 64);
3601   MacroAssembler* masm = new MacroAssembler(&code);
3602   address start = __ pc();
3603 
3604   __ enter();
3605   address the_pc = __ pc();
3606 
3607   int frame_complete = the_pc - start;
3608 
3609   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3610   __ movptr(c_rarg0, r15_thread);
3611   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
3612   __ reset_last_Java_frame(true);
3613 
3614   // rax is jobject handle result, unpack and process it through a barrier.
3615   __ resolve_global_jobject(rax, c_rarg0);
3616 
3617   __ leave();
3618   __ ret(0);
3619 
3620   OopMapSet* oop_maps = new OopMapSet();
3621   OopMap* map = new OopMap(framesize, 1);
3622   oop_maps->add_gc_map(frame_complete, map);
3623 
3624   RuntimeStub* stub =
3625     RuntimeStub::new_runtime_stub(name,
3626                                   &code,
3627                                   frame_complete,
3628                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3629                                   oop_maps,
3630                                   false);
3631   return stub;
3632 }
3633 
3634 // For c2: call to return a leased buffer.
3635 RuntimeStub* SharedRuntime::generate_jfr_return_lease() {
3636   enum layout {
3637     rbp_off,
3638     rbpH_off,
3639     return_off,
3640     return_off2,
3641     framesize // inclusive of return address
3642   };
3643 
3644   const char* name = SharedRuntime::stub_name(SharedStubId::jfr_return_lease_id);
3645   CodeBuffer code(name, 1024, 64);
3646   MacroAssembler* masm = new MacroAssembler(&code);
3647   address start = __ pc();
3648 
3649   __ enter();
3650   address the_pc = __ pc();
3651 
3652   int frame_complete = the_pc - start;
3653 
3654   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2);
3655   __ movptr(c_rarg0, r15_thread);
3656   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
3657   __ reset_last_Java_frame(true);
3658 
3659   __ leave();
3660   __ ret(0);
3661 
3662   OopMapSet* oop_maps = new OopMapSet();
3663   OopMap* map = new OopMap(framesize, 1);
3664   oop_maps->add_gc_map(frame_complete, map);
3665 
3666   RuntimeStub* stub =
3667     RuntimeStub::new_runtime_stub(name,
3668                                   &code,
3669                                   frame_complete,
3670                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3671                                   oop_maps,
3672                                   false);
3673   return stub;
3674 }
3675 
3676 #endif // INCLUDE_JFR
3677