1 /*
   2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #ifndef _WINDOWS
  26 #include "alloca.h"
  27 #endif
  28 #include "asm/macroAssembler.hpp"
  29 #include "asm/macroAssembler.inline.hpp"
  30 #include "code/aotCodeCache.hpp"
  31 #include "code/compiledIC.hpp"
  32 #include "code/debugInfoRec.hpp"
  33 #include "code/nativeInst.hpp"
  34 #include "code/vtableStubs.hpp"
  35 #include "compiler/oopMap.hpp"
  36 #include "gc/shared/collectedHeap.hpp"
  37 #include "gc/shared/gcLocker.hpp"
  38 #include "gc/shared/barrierSet.hpp"
  39 #include "gc/shared/barrierSetAssembler.hpp"
  40 #include "interpreter/interpreter.hpp"
  41 #include "logging/log.hpp"
  42 #include "memory/resourceArea.hpp"
  43 #include "memory/universe.hpp"
  44 #include "oops/klass.inline.hpp"
  45 #include "oops/method.inline.hpp"
  46 #include "prims/methodHandles.hpp"
  47 #include "runtime/continuation.hpp"
  48 #include "runtime/continuationEntry.inline.hpp"
  49 #include "runtime/globals.hpp"
  50 #include "runtime/jniHandles.hpp"
  51 #include "runtime/safepointMechanism.hpp"
  52 #include "runtime/sharedRuntime.hpp"
  53 #include "runtime/signature.hpp"
  54 #include "runtime/stubRoutines.hpp"
  55 #include "runtime/timerTrace.hpp"
  56 #include "runtime/vframeArray.hpp"
  57 #include "runtime/vm_version.hpp"
  58 #include "utilities/align.hpp"
  59 #include "utilities/checkedCast.hpp"
  60 #include "utilities/formatBuffer.hpp"
  61 #include "vmreg_x86.inline.hpp"
  62 #ifdef COMPILER1
  63 #include "c1/c1_Runtime1.hpp"
  64 #endif
  65 #ifdef COMPILER2
  66 #include "opto/runtime.hpp"
  67 #endif
  68 #if INCLUDE_JVMCI
  69 #include "jvmci/jvmciJavaClasses.hpp"
  70 #endif
  71 
  72 #define __ masm->
  73 
  74 #ifdef PRODUCT
  75 #define BLOCK_COMMENT(str) /* nothing */
  76 #else
  77 #define BLOCK_COMMENT(str) __ block_comment(str)
  78 #endif // PRODUCT
  79 
  80 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
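// For example, assuming the usual x86_64 values (StackAlignmentInBytes == 16,
// VMRegImpl::stack_slot_size == 4), StackAlignmentInSlots works out to 4
// compiler stack slots per alignment unit.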
  81 
  82 class RegisterSaver {
  83   // Capture info about frame layout.  Layout offsets are in jint
  84   // units because compiler frame slots are jints.
  85 #define XSAVE_AREA_BEGIN 160
  86 #define XSAVE_AREA_YMM_BEGIN 576
  87 #define XSAVE_AREA_EGPRS 960
  88 #define XSAVE_AREA_OPMASK_BEGIN 1088
  89 #define XSAVE_AREA_ZMM_BEGIN 1152
  90 #define XSAVE_AREA_UPPERBANK 1664
  91 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
  92 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
  93 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
  94 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
  95 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
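  // For illustration, DEF_XMM_OFFS(0) expands to
  //   xmm0_off = xmm_off + (0)*16/BytesPerInt, xmm0H_off
  // i.e. each DEF_*_OFFS entry contributes a low-word offset plus an adjacent
  // "H" (high-word) offset, both counted in jint-sized stack slots.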
  96   enum layout {
  97     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt,    // fxsave save area
  98     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,  // offset in fxsave save area
  99     DEF_XMM_OFFS(0),
 100     DEF_XMM_OFFS(1),
 101     // 2..15 are implied in range usage
 102     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 103     DEF_YMM_OFFS(0),
 104     DEF_YMM_OFFS(1),
 105     r16_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
 106     r16H_off,
 107     r17_off, r17H_off,
 108     r18_off, r18H_off,
 109     r19_off, r19H_off,
 110     r20_off, r20H_off,
 111     r21_off, r21H_off,
 112     r22_off, r22H_off,
 113     r23_off, r23H_off,
 114     r24_off, r24H_off,
 115     r25_off, r25H_off,
 116     r26_off, r26H_off,
 117     r27_off, r27H_off,
 118     r28_off, r28H_off,
 119     r29_off, r29H_off,
 120     r30_off, r30H_off,
 121     r31_off, r31H_off,
 122     opmask_off   = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 123     DEF_OPMASK_OFFS(0),
 124     DEF_OPMASK_OFFS(1),
 125     // 2..7 are implied in range usage
 126     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 127     DEF_ZMM_OFFS(0),
 128     DEF_ZMM_OFFS(1),
 129     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 130     DEF_ZMM_UPPER_OFFS(16),
 131     DEF_ZMM_UPPER_OFFS(17),
 132     // 18..31 are implied in range usage
 133     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 134     fpu_stateH_end,
 135     r15_off, r15H_off,
 136     r14_off, r14H_off,
 137     r13_off, r13H_off,
 138     r12_off, r12H_off,
 139     r11_off, r11H_off,
 140     r10_off, r10H_off,
 141     r9_off,  r9H_off,
 142     r8_off,  r8H_off,
 143     rdi_off, rdiH_off,
 144     rsi_off, rsiH_off,
 145     ignore_off, ignoreH_off,  // extra copy of rbp
 146     rsp_off, rspH_off,
 147     rbx_off, rbxH_off,
 148     rdx_off, rdxH_off,
 149     rcx_off, rcxH_off,
 150     rax_off, raxH_off,
 151     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 152     align_off, alignH_off,
 153     flags_off, flagsH_off,
 154     // The frame sender code expects that rbp will be in the "natural" place and
 155     // will override any oopMap setting for it. We must therefore force the layout
 156     // so that it agrees with the frame sender code.
 157     rbp_off, rbpH_off,        // copy of rbp we will restore
 158     return_off, returnH_off,  // slot for return address
 159     reg_save_size             // size in compiler stack slots
 160   };
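  // Rough reading of the layout above, from low offsets (bottom of the save
  // area) to high: the fxsave/xsave state comes first, then the general
  // purpose registers, then the alignment filler and flags, and finally the
  // saved rbp and the return address slot at the very top, which is the
  // layout the frame sender code expects.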
 161 
 162  public:
 163   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 164   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 165 
 166   // Offsets into the register save area
 167   // Used by deoptimization when it is managing result register
 168   // values on its own
 169 
 170   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 171   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 172   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 173   static int r15_offset_in_bytes(void)    { return BytesPerInt * r15_off; }
 174   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 175   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 176 
 177   // During deoptimization only the result registers need to be restored,
 178   // all the other values have already been extracted.
 179   static void restore_result_registers(MacroAssembler* masm);
 180 };
 181 
 182 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 183   int off = 0;
 184   int num_xmm_regs = XMMRegister::available_xmm_registers();
 185 #if COMPILER2_OR_JVMCI
 186   if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors longer than 16 bytes are supported only with AVX
 188   }
 189   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 190 #else
 191   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 192 #endif
 193 
 194   // Always make the frame size 16-byte aligned, both vector and non vector stacks are always allocated
 195   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 196   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 197   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 198   // CodeBlob frame size is in words.
 199   int frame_size_in_words = frame_size_in_bytes / wordSize;
 200   *total_frame_words = frame_size_in_words;
 201 
 202   // Save registers, fpu state, and flags.
 203   // We assume caller has already pushed the return address onto the
 204   // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address, like a normal enter.
 207 
 208   __ enter();          // rsp becomes 16-byte aligned here
 209   __ pushf();
 210   // Make sure rsp stays 16-byte aligned
 211   __ subq(rsp, 8);
 212   // Push CPU state in multiple of 16 bytes
 213   __ save_legacy_gprs();
 214   __ push_FPU_state();
 215 
 216 
 217   // push cpu state handles this on EVEX enabled targets
 218   if (save_wide_vectors) {
 219     // Save upper half of YMM registers(0..15)
 220     int base_addr = XSAVE_AREA_YMM_BEGIN;
 221     for (int n = 0; n < 16; n++) {
 222       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 223     }
 224     if (VM_Version::supports_evex()) {
 225       // Save upper half of ZMM registers(0..15)
 226       base_addr = XSAVE_AREA_ZMM_BEGIN;
 227       for (int n = 0; n < 16; n++) {
 228         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 229       }
 230       // Save full ZMM registers(16..num_xmm_regs)
 231       base_addr = XSAVE_AREA_UPPERBANK;
 232       off = 0;
 233       int vector_len = Assembler::AVX_512bit;
 234       for (int n = 16; n < num_xmm_regs; n++) {
 235         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 236       }
 237 #if COMPILER2_OR_JVMCI
 238       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 239       off = 0;
 240       for(int n = 0; n < KRegister::number_of_registers; n++) {
 241         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 242       }
 243 #endif
 244     }
 245   } else {
 246     if (VM_Version::supports_evex()) {
 247       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 248       int base_addr = XSAVE_AREA_UPPERBANK;
 249       off = 0;
 250       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 251       for (int n = 16; n < num_xmm_regs; n++) {
 252         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 253       }
 254 #if COMPILER2_OR_JVMCI
 255       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 256       off = 0;
 257       for(int n = 0; n < KRegister::number_of_registers; n++) {
 258         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 259       }
 260 #endif
 261     }
 262   }
 263 
 264 #if COMPILER2_OR_JVMCI
 265   if (UseAPX) {
 266       int base_addr = XSAVE_AREA_EGPRS;
 267       off = 0;
 268       for (int n = 16; n < Register::number_of_registers; n++) {
 269         __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
 270       }
 271   }
 272 #endif
 273 
 274   __ vzeroupper();
 275   if (frame::arg_reg_save_area_bytes != 0) {
 276     // Allocate argument register save area
 277     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 278   }
 279 
 280   // Set an oopmap for the call site.  This oopmap will map all
 281   // oop-registers and debug-info registers as callee-saved.  This
 282   // will allow deoptimization at this safepoint to find all possible
 283   // debug-info recordings, as well as let GC find all oops.
 284 
 285   OopMapSet *oop_maps = new OopMapSet();
 286   OopMap* map = new OopMap(frame_size_in_slots, 0);
 287 
 288 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 289 
 290   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 291   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 292   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 293   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 294   // rbp location is known implicitly by the frame sender code, needs no oopmap
  // and the location where rbp was saved is ignored
 296   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 297   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 298   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 299   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 300   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 301   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 302   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 303   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 304   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 305   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 306 
 307   if (UseAPX) {
 308     map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
 309     map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
 310     map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
 311     map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
 312     map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
 313     map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
 314     map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
 315     map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
 316     map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
 317     map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
 318     map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
 319     map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
 320     map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
 321     map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
 322     map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
 323     map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
 324   }
 325   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 326   // on EVEX enabled targets, we get it included in the xsave area
 327   off = xmm0_off;
 328   int delta = xmm1_off - off;
 329   for (int n = 0; n < 16; n++) {
 330     XMMRegister xmm_name = as_XMMRegister(n);
 331     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 332     off += delta;
 333   }
 334   if (UseAVX > 2) {
 335     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 336     off = zmm16_off;
 337     delta = zmm17_off - off;
 338     for (int n = 16; n < num_xmm_regs; n++) {
 339       XMMRegister zmm_name = as_XMMRegister(n);
 340       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 341       off += delta;
 342     }
 343   }
 344 
 345 #if COMPILER2_OR_JVMCI
 346   if (save_wide_vectors) {
 347     // Save upper half of YMM registers(0..15)
 348     off = ymm0_off;
 349     delta = ymm1_off - ymm0_off;
 350     for (int n = 0; n < 16; n++) {
 351       XMMRegister ymm_name = as_XMMRegister(n);
 352       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 353       off += delta;
 354     }
 355     if (VM_Version::supports_evex()) {
 356       // Save upper half of ZMM registers(0..15)
 357       off = zmm0_off;
 358       delta = zmm1_off - zmm0_off;
 359       for (int n = 0; n < 16; n++) {
 360         XMMRegister zmm_name = as_XMMRegister(n);
 361         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 362         off += delta;
 363       }
 364     }
 365   }
 366 #endif // COMPILER2_OR_JVMCI
 367 
 368   // %%% These should all be a waste but we'll keep things as they were for now
 369   if (true) {
 370     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 371     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 372     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 373     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 374     // rbp location is known implicitly by the frame sender code, needs no oopmap
 375     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 376     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 377     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 378     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 379     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 380     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 381     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 382     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 383     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 384     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 385     if (UseAPX) {
 386       map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
 387       map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
 388       map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
 389       map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
 390       map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
 391       map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
 392       map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
 393       map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
 394       map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
 395       map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
 396       map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
 397       map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
 398       map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
 399       map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
 400       map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
 401       map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
 402     }
 403     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 404     // on EVEX enabled targets, we get it included in the xsave area
 405     off = xmm0H_off;
 406     delta = xmm1H_off - off;
 407     for (int n = 0; n < 16; n++) {
 408       XMMRegister xmm_name = as_XMMRegister(n);
 409       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 410       off += delta;
 411     }
 412     if (UseAVX > 2) {
 413       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 414       off = zmm16H_off;
 415       delta = zmm17H_off - off;
 416       for (int n = 16; n < num_xmm_regs; n++) {
 417         XMMRegister zmm_name = as_XMMRegister(n);
 418         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 419         off += delta;
 420       }
 421     }
 422   }
 423 
 424   return map;
 425 }
 426 
 427 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 428   int num_xmm_regs = XMMRegister::available_xmm_registers();
 429   if (frame::arg_reg_save_area_bytes != 0) {
 430     // Pop arg register save area
 431     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 432   }
 433 
 434 #if COMPILER2_OR_JVMCI
 435   if (restore_wide_vectors) {
 436     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 437     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 438   }
 439 #else
 440   assert(!restore_wide_vectors, "vectors are generated only by C2");
 441 #endif
 442 
 443   __ vzeroupper();
 444 
 445   // On EVEX enabled targets everything is handled in pop fpu state
 446   if (restore_wide_vectors) {
 447     // Restore upper half of YMM registers (0..15)
 448     int base_addr = XSAVE_AREA_YMM_BEGIN;
 449     for (int n = 0; n < 16; n++) {
 450       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 451     }
 452     if (VM_Version::supports_evex()) {
 453       // Restore upper half of ZMM registers (0..15)
 454       base_addr = XSAVE_AREA_ZMM_BEGIN;
 455       for (int n = 0; n < 16; n++) {
 456         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 457       }
 458       // Restore full ZMM registers(16..num_xmm_regs)
 459       base_addr = XSAVE_AREA_UPPERBANK;
 460       int vector_len = Assembler::AVX_512bit;
 461       int off = 0;
 462       for (int n = 16; n < num_xmm_regs; n++) {
 463         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 464       }
 465 #if COMPILER2_OR_JVMCI
 466       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 467       off = 0;
 468       for (int n = 0; n < KRegister::number_of_registers; n++) {
 469         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 470       }
 471 #endif
 472     }
 473   } else {
 474     if (VM_Version::supports_evex()) {
 475       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 476       int base_addr = XSAVE_AREA_UPPERBANK;
 477       int off = 0;
 478       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 479       for (int n = 16; n < num_xmm_regs; n++) {
 480         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 481       }
 482 #if COMPILER2_OR_JVMCI
 483       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 484       off = 0;
 485       for (int n = 0; n < KRegister::number_of_registers; n++) {
 486         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 487       }
 488 #endif
 489     }
 490   }
 491 
 492 #if COMPILER2_OR_JVMCI
 493   if (UseAPX) {
 494     int base_addr = XSAVE_AREA_EGPRS;
 495     int off = 0;
 496     for (int n = 16; n < Register::number_of_registers; n++) {
 497       __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
 498     }
 499   }
 500 #endif
 501 
 502   // Recover CPU state
 503   __ pop_FPU_state();
 504   __ restore_legacy_gprs();
 505   __ addq(rsp, 8);
 506   __ popf();
 507   // Get the rbp described implicitly by the calling convention (no oopMap)
 508   __ pop(rbp);
 509 }
 510 
 511 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 512 
 513   // Just restore result register. Only used by deoptimization. By
 514   // now any callee save register that needs to be restored to a c2
 515   // caller of the deoptee has been extracted into the vframeArray
 516   // and will be stuffed into the c2i adapter we create for later
 517   // restoration so only result registers need to be restored here.
 518 
 519   // Restore fp result register
 520   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 521   // Restore integer result register
 522   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 523   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 524 
  // Pop all of the register save area off the stack except the return address
 526   __ addptr(rsp, return_offset_in_bytes());
 527 }
 528 
// Is the vector's size (in bytes) bigger than the size saved by default?
// 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
 531 bool SharedRuntime::is_wide_vector(int size) {
 532   return size > 16;
 533 }
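// For example, is_wide_vector(32) is true for a 256-bit YMM value, while
// is_wide_vector(16) is false because a plain 128-bit XMM register is already
// covered by the default fxsave/fxrstor save area.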
 534 
 535 // ---------------------------------------------------------------------------
 536 // Read the array of BasicTypes from a signature, and compute where the
 537 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 538 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 539 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 540 // as framesizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp),
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
// Registers up to Register::number_of_registers are the 64-bit
 544 // integer registers.
 545 
 546 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 547 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 548 // units regardless of build. Of course for i486 there is no 64 bit build
 549 
 550 // The Java calling convention is a "shifted" version of the C ABI.
 551 // By skipping the first C ABI register we can call non-static jni methods
 552 // with small numbers of arguments without having to shuffle the arguments
 553 // at all. Since we control the java ABI we ought to at least get some
 554 // advantage out of it.
 555 
 556 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 557                                            VMRegPair *regs,
 558                                            int total_args_passed) {
 559 
 560   // Create the mapping between argument positions and
 561   // registers.
 562   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 563     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 564   };
 565   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 566     j_farg0, j_farg1, j_farg2, j_farg3,
 567     j_farg4, j_farg5, j_farg6, j_farg7
 568   };
 569 
 570 
 571   uint int_args = 0;
 572   uint fp_args = 0;
 573   uint stk_args = 0;
 574 
 575   for (int i = 0; i < total_args_passed; i++) {
 576     switch (sig_bt[i]) {
 577     case T_BOOLEAN:
 578     case T_CHAR:
 579     case T_BYTE:
 580     case T_SHORT:
 581     case T_INT:
 582       if (int_args < Argument::n_int_register_parameters_j) {
 583         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 584       } else {
 585         stk_args = align_up(stk_args, 2);
 586         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 587         stk_args += 1;
 588       }
 589       break;
 590     case T_VOID:
 591       // halves of T_LONG or T_DOUBLE
 592       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 593       regs[i].set_bad();
 594       break;
 595     case T_LONG:
 596       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 597       // fall through
 598     case T_OBJECT:
 599     case T_ARRAY:
 600     case T_ADDRESS:
 601       if (int_args < Argument::n_int_register_parameters_j) {
 602         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 603       } else {
 604         stk_args = align_up(stk_args, 2);
 605         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 606         stk_args += 2;
 607       }
 608       break;
 609     case T_FLOAT:
 610       if (fp_args < Argument::n_float_register_parameters_j) {
 611         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 612       } else {
 613         stk_args = align_up(stk_args, 2);
 614         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 615         stk_args += 1;
 616       }
 617       break;
 618     case T_DOUBLE:
 619       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 620       if (fp_args < Argument::n_float_register_parameters_j) {
 621         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 622       } else {
 623         stk_args = align_up(stk_args, 2);
 624         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 625         stk_args += 2;
 626       }
 627       break;
 628     default:
 629       ShouldNotReachHere();
 630       break;
 631     }
 632   }
 633 
 634   return stk_args;
 635 }
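// A hypothetical example of the mapping above: for arguments (int, long, float,
// double), sig_bt is { T_INT, T_LONG, T_VOID, T_FLOAT, T_DOUBLE, T_VOID } and
// the loop assigns
//   T_INT    -> j_rarg0        T_LONG   -> j_rarg1  (its T_VOID half is set_bad)
//   T_FLOAT  -> j_farg0        T_DOUBLE -> j_farg1  (its T_VOID half is set_bad)
// so no stack slots are needed and the function returns 0.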
 636 
// Patch the caller's callsite with the entry to compiled code if it exists.
 638 static void patch_callers_callsite(MacroAssembler *masm) {
 639   Label L;
 640   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 641   __ jcc(Assembler::equal, L);
 642 
 643   // Save the current stack pointer
 644   __ mov(r13, rsp);
 645   // Schedule the branch target address early.
 646   // Call into the VM to patch the caller, then jump to compiled callee
 647   // rax isn't live so capture return address while we easily can
 648   __ movptr(rax, Address(rsp, 0));
 649 
 650   // align stack so push_CPU_state doesn't fault
 651   __ andptr(rsp, -(StackAlignmentInBytes));
 652   __ push_CPU_state();
 653   __ vzeroupper();
 654   // VM needs caller's callsite
 655   // VM needs target method
 656   // This needs to be a long call since we will relocate this adapter to
 657   // the codeBuffer and it may not reach
 658 
 659   // Allocate argument register save area
 660   if (frame::arg_reg_save_area_bytes != 0) {
 661     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 662   }
 663   __ mov(c_rarg0, rbx);
 664   __ mov(c_rarg1, rax);
 665   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 666 
 667   // De-allocate argument register save area
 668   if (frame::arg_reg_save_area_bytes != 0) {
 669     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 670   }
 671 
 672   __ vzeroupper();
 673   __ pop_CPU_state();
 674   // restore sp
 675   __ mov(rsp, r13);
 676   __ bind(L);
 677 }
 678 
 679 static void gen_c2i_adapter(MacroAssembler *masm,
 680                             int total_args_passed,
 681                             int comp_args_on_stack,
 682                             const BasicType *sig_bt,
 683                             const VMRegPair *regs,
 684                             Label& skip_fixup) {
 685   // Before we get into the guts of the C2I adapter, see if we should be here
 686   // at all.  We've come from compiled code and are attempting to jump to the
 687   // interpreter, which means the caller made a static call to get here
 688   // (vcalls always get a compiled target if there is one).  Check for a
 689   // compiled target.  If there is one, we need to patch the caller's call.
 690   patch_callers_callsite(masm);
 691 
 692   __ bind(skip_fixup);
 693 
 694   // Since all args are passed on the stack, total_args_passed *
 695   // Interpreter::stackElementSize is the space we need.
 696 
 697   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 698 
 699   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 700 
 701   // stack is aligned, keep it that way
 702   // This is not currently needed or enforced by the interpreter, but
 703   // we might as well conform to the ABI.
 704   extraspace = align_up(extraspace, 2*wordSize);
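  // For example, assuming Interpreter::stackElementSize is one word (8 bytes)
  // on x86_64, three arguments need 24 bytes, which is rounded up to 32 here
  // to keep the 16-byte alignment.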
 705 
 706   // set senderSP value
 707   __ lea(r13, Address(rsp, wordSize));
 708 
 709 #ifdef ASSERT
 710   __ check_stack_alignment(r13, "sender stack not aligned");
 711 #endif
 712   if (extraspace > 0) {
 713     // Pop the return address
 714     __ pop(rax);
 715 
 716     __ subptr(rsp, extraspace);
 717 
 718     // Push the return address
 719     __ push(rax);
 720 
 721     // Account for the return address location since we store it first rather
 722     // than hold it in a register across all the shuffling
 723     extraspace += wordSize;
 724   }
 725 
 726 #ifdef ASSERT
 727   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 728 #endif
 729 
 730   // Now write the args into the outgoing interpreter space
 731   for (int i = 0; i < total_args_passed; i++) {
 732     if (sig_bt[i] == T_VOID) {
 733       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 734       continue;
 735     }
 736 
 737     // offset to start parameters
 738     int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
 739     int next_off = st_off - Interpreter::stackElementSize;
 740 
 741     // Say 4 args:
 742     // i   st_off
 743     // 0   32 T_LONG
 744     // 1   24 T_VOID
 745     // 2   16 T_OBJECT
 746     // 3    8 T_BOOL
 747     // -    0 return address
 748     //
    // However, to make things extra confusing: because we can fit a long/double in
    // a single slot on a 64-bit VM, and it would be silly to break them up, the interpreter
    // leaves one slot empty and only stores to a single slot. In this case the
    // slot that is occupied is the T_VOID slot. See, I said it was confusing.
 753 
 754     VMReg r_1 = regs[i].first();
 755     VMReg r_2 = regs[i].second();
 756     if (!r_1->is_valid()) {
 757       assert(!r_2->is_valid(), "");
 758       continue;
 759     }
 760     if (r_1->is_stack()) {
 761       // memory to memory use rax
 762       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 763       if (!r_2->is_valid()) {
 764         // sign extend??
 765         __ movl(rax, Address(rsp, ld_off));
 766         __ movptr(Address(rsp, st_off), rax);
 767 
 768       } else {
 769 
 770         __ movq(rax, Address(rsp, ld_off));
 771 
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG.
        // T_DOUBLE and T_LONG use two slots in the interpreter.
 774         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 775           // ld_off == LSW, ld_off+wordSize == MSW
 776           // st_off == MSW, next_off == LSW
 777           __ movq(Address(rsp, next_off), rax);
 778 #ifdef ASSERT
 779           // Overwrite the unused slot with known junk
 780           __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 781           __ movptr(Address(rsp, st_off), rax);
 782 #endif /* ASSERT */
 783         } else {
 784           __ movq(Address(rsp, st_off), rax);
 785         }
 786       }
 787     } else if (r_1->is_Register()) {
 788       Register r = r_1->as_Register();
 789       if (!r_2->is_valid()) {
        // must be only an int (or smaller), so move only 32 bits to the slot
        // why not sign extend??
 792         __ movl(Address(rsp, st_off), r);
 793       } else {
        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG.
        // T_DOUBLE and T_LONG use two slots in the interpreter.
 796         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 797           // long/double in gpr
 798 #ifdef ASSERT
 799           // Overwrite the unused slot with known junk
 800           __ mov64(rax, CONST64(0xdeadffffdeadaaab));
 801           __ movptr(Address(rsp, st_off), rax);
 802 #endif /* ASSERT */
 803           __ movq(Address(rsp, next_off), r);
 804         } else {
 805           __ movptr(Address(rsp, st_off), r);
 806         }
 807       }
 808     } else {
 809       assert(r_1->is_XMMRegister(), "");
 810       if (!r_2->is_valid()) {
        // only a float, use just part of the slot
 812         __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
 813       } else {
 814 #ifdef ASSERT
 815         // Overwrite the unused slot with known junk
 816         __ mov64(rax, CONST64(0xdeadffffdeadaaac));
 817         __ movptr(Address(rsp, st_off), rax);
 818 #endif /* ASSERT */
 819         __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
 820       }
 821     }
 822   }
 823 
 824   // Schedule the branch target address early.
 825   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 826   __ jmp(rcx);
 827 }
 828 
 829 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 830                                     int total_args_passed,
 831                                     int comp_args_on_stack,
 832                                     const BasicType *sig_bt,
 833                                     const VMRegPair *regs) {
 834 
 835   // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
  // code goes non-entrant while we get args ready.
  // In addition we use r13 to locate all the interpreter args, as
  // we must align the stack to 16 bytes on an i2c entry; otherwise we
  // lose the alignment expected by all compiled code, and the register
  // save code can segv when fxsave instructions find an improperly
  // aligned stack pointer.
 843 
 844   // Adapters can be frameless because they do not require the caller
 845   // to perform additional cleanup work, such as correcting the stack pointer.
 846   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 847   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 848   // even if a callee has modified the stack pointer.
 849   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 850   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 851   // up via the senderSP register).
 852   // In other words, if *either* the caller or callee is interpreted, we can
 853   // get the stack pointer repaired after a call.
 854   // This is why c2i and i2c adapters cannot be indefinitely composed.
 855   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 856   // both caller and callee would be compiled methods, and neither would
 857   // clean up the stack pointer changes performed by the two adapters.
 858   // If this happens, control eventually transfers back to the compiled
 859   // caller, but with an uncorrected stack, causing delayed havoc.
 860 
 861   // Must preserve original SP for loading incoming arguments because
 862   // we need to align the outgoing SP for compiled code.
 863   __ movptr(r11, rsp);
 864 
 865   // Pick up the return address
 866   __ pop(rax);
 867 
 868   // Convert 4-byte c2 stack slots to words.
 869   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
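  // For example, 3 compiled stack slots are 12 bytes, which align_up rounds to
  // 16 bytes, i.e. comp_words_on_stack == 2.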
 870 
 871   if (comp_args_on_stack) {
 872     __ subptr(rsp, comp_words_on_stack * wordSize);
 873   }
 874 
 875   // Ensure compiled code always sees stack at proper alignment
 876   __ andptr(rsp, -16);
 877 
  // Push the return address and misalign the stack so that the youngest frame
  // always sees it where a call instruction would have placed it.
 880   __ push(rax);
 881 
 882   // Put saved SP in another register
 883   const Register saved_sp = rax;
 884   __ movptr(saved_sp, r11);
 885 
 886   // Will jump to the compiled code just as if compiled code was doing it.
 887   // Pre-load the register-jump target early, to schedule it better.
 888   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
 889 
 890 #if INCLUDE_JVMCI
 891   if (EnableJVMCI) {
 892     // check if this call should be routed towards a specific entry point
 893     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 894     Label no_alternative_target;
 895     __ jcc(Assembler::equal, no_alternative_target);
 896     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
 897     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 898     __ bind(no_alternative_target);
 899   }
 900 #endif // INCLUDE_JVMCI
 901 
 902   // Now generate the shuffle code.  Pick up all register args and move the
 903   // rest through the floating point stack top.
 904   for (int i = 0; i < total_args_passed; i++) {
 905     if (sig_bt[i] == T_VOID) {
 906       // Longs and doubles are passed in native word order, but misaligned
 907       // in the 32-bit build.
 908       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 909       continue;
 910     }
 911 
 912     // Pick up 0, 1 or 2 words from SP+offset.
 913 
 914     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
 915             "scrambled load targets?");
 916     // Load in argument order going down.
 917     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
 918     // Point to interpreter value (vs. tag)
 919     int next_off = ld_off - Interpreter::stackElementSize;
 920     //
 921     //
 922     //
 923     VMReg r_1 = regs[i].first();
 924     VMReg r_2 = regs[i].second();
 925     if (!r_1->is_valid()) {
 926       assert(!r_2->is_valid(), "");
 927       continue;
 928     }
 929     if (r_1->is_stack()) {
 930       // Convert stack slot to an SP offset (+ wordSize to account for return address )
 931       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
 932 
      // We can use r13 as a temp here because compiled code doesn't need r13 as an input,
      // and if we end up going through a c2i because of a miss, a reasonable value of r13
      // will be generated.
 936       if (!r_2->is_valid()) {
 937         // sign extend???
 938         __ movl(r13, Address(saved_sp, ld_off));
 939         __ movptr(Address(rsp, st_off), r13);
 940       } else {
 941         //
        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE.
        // The interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
        //
        // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
        // are accessed at negative offsets so the LSW is at the LOW address.
 948 
 949         // ld_off is MSW so get LSW
 950         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 951                            next_off : ld_off;
 952         __ movq(r13, Address(saved_sp, offset));
 953         // st_off is LSW (i.e. reg.first())
 954         __ movq(Address(rsp, st_off), r13);
 955       }
 956     } else if (r_1->is_Register()) {  // Register argument
 957       Register r = r_1->as_Register();
 958       assert(r != rax, "must be different");
 959       if (r_2->is_valid()) {
 960         //
        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE.
        // The interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
        // so we must adjust where to pick up the data to match the interpreter.
 964 
 965         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 966                            next_off : ld_off;
 967 
 968         // this can be a misaligned move
 969         __ movq(r, Address(saved_sp, offset));
 970       } else {
 971         // sign extend and use a full word?
 972         __ movl(r, Address(saved_sp, ld_off));
 973       }
 974     } else {
 975       if (!r_2->is_valid()) {
 976         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
 977       } else {
 978         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
 979       }
 980     }
 981   }
 982 
 983   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
 984 
 985   // 6243940 We might end up in handle_wrong_method if
 986   // the callee is deoptimized as we race thru here. If that
 987   // happens we don't want to take a safepoint because the
 988   // caller frame will look interpreted and arguments are now
 989   // "compiled" so it is much better to make this transition
 990   // invisible to the stack walking code. Unfortunately if
 991   // we try and find the callee by normal means a safepoint
  // is possible. So we stash the desired callee in the thread,
  // and the VM will find it there should this case occur.
 994 
 995   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
 996 
  // put Method* where a c2i would expect it should we end up there;
  // only needed because c2 resolve stubs return Method* as a result in
  // rax
1000   __ mov(rax, rbx);
1001   __ jmp(r11);
1002 }
1003 
1004 // ---------------------------------------------------------------
1005 void SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
1006                                             int total_args_passed,
1007                                             int comp_args_on_stack,
1008                                             const BasicType *sig_bt,
1009                                             const VMRegPair *regs,
1010                                             address entry_address[AdapterBlob::ENTRY_COUNT]) {
1011   entry_address[AdapterBlob::I2C] = __ pc();
1012 
1013   gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
1014 
1015   // -------------------------------------------------------------------------
1016   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
1017   // to the interpreter.  The args start out packed in the compiled layout.  They
1018   // need to be unpacked into the interpreter layout.  This will almost always
1019   // require some stack space.  We grow the current (compiled) stack, then repack
1020   // the args.  We  finally end in a jump to the generic interpreter entry point.
1021   // On exit from the interpreter, the interpreter will restore our SP (lest the
1022   // compiled code, which relies solely on SP and not RBP, get sick).
1023 
1024   entry_address[AdapterBlob::C2I_Unverified] = __ pc();
1025   Label skip_fixup;
1026 
1027   Register data = rax;
1028   Register receiver = j_rarg0;
1029   Register temp = rbx;
1030 
1031   {
1032     __ ic_check(1 /* end_alignment */);
1033     __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
    // Method might have been compiled since the call site was patched to
    // interpreted; if that is the case, treat it as a miss so we can get
    // the call site corrected.
1037     __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1038     __ jcc(Assembler::equal, skip_fixup);
1039     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1040   }
1041 
1042   entry_address[AdapterBlob::C2I] = __ pc();
1043 
1044   // Class initialization barrier for static methods
1045   entry_address[AdapterBlob::C2I_No_Clinit_Check] = nullptr;
1046   if (VM_Version::supports_fast_class_init_checks()) {
1047     Label L_skip_barrier;
1048     Register method = rbx;
1049 
1050     { // Bypass the barrier for non-static methods
1051       Register flags = rscratch1;
1052       __ load_unsigned_short(flags, Address(method, Method::access_flags_offset()));
1053       __ testl(flags, JVM_ACC_STATIC);
1054       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1055     }
1056 
1057     Register klass = rscratch1;
1058     __ load_method_holder(klass, method);
1059     __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
1060 
1061     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1062 
1063     __ bind(L_skip_barrier);
1064     entry_address[AdapterBlob::C2I_No_Clinit_Check] = __ pc();
1065   }
1066 
1067   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1068   bs->c2i_entry_barrier(masm);
1069 
1070   gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1071   return;
1072 }
1073 
1074 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1075                                          VMRegPair *regs,
1076                                          int total_args_passed) {
1077 
1078 // We return the amount of VMRegImpl stack slots we need to reserve for all
1079 // the arguments NOT counting out_preserve_stack_slots.
1080 
1081 // NOTE: These arrays will have to change when c1 is ported
1082 #ifdef _WIN64
1083     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1084       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1085     };
1086     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1087       c_farg0, c_farg1, c_farg2, c_farg3
1088     };
1089 #else
1090     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1091       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1092     };
1093     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1094       c_farg0, c_farg1, c_farg2, c_farg3,
1095       c_farg4, c_farg5, c_farg6, c_farg7
1096     };
1097 #endif // _WIN64
1098 
1099 
1100     uint int_args = 0;
1101     uint fp_args = 0;
1102     uint stk_args = 0; // inc by 2 each time
1103 
1104     for (int i = 0; i < total_args_passed; i++) {
1105       switch (sig_bt[i]) {
1106       case T_BOOLEAN:
1107       case T_CHAR:
1108       case T_BYTE:
1109       case T_SHORT:
1110       case T_INT:
1111         if (int_args < Argument::n_int_register_parameters_c) {
1112           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1113 #ifdef _WIN64
1114           fp_args++;
          // Allocate slots for the callee to stuff register args on the stack.
1116           stk_args += 2;
1117 #endif
1118         } else {
1119           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1120           stk_args += 2;
1121         }
1122         break;
1123       case T_LONG:
1124         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1125         // fall through
1126       case T_OBJECT:
1127       case T_ARRAY:
1128       case T_ADDRESS:
1129       case T_METADATA:
1130         if (int_args < Argument::n_int_register_parameters_c) {
1131           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1132 #ifdef _WIN64
1133           fp_args++;
1134           stk_args += 2;
1135 #endif
1136         } else {
1137           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1138           stk_args += 2;
1139         }
1140         break;
1141       case T_FLOAT:
1142         if (fp_args < Argument::n_float_register_parameters_c) {
1143           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1144 #ifdef _WIN64
1145           int_args++;
          // Allocate slots for the callee to stuff register args on the stack.
1147           stk_args += 2;
1148 #endif
1149         } else {
1150           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1151           stk_args += 2;
1152         }
1153         break;
1154       case T_DOUBLE:
1155         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1156         if (fp_args < Argument::n_float_register_parameters_c) {
1157           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1158 #ifdef _WIN64
1159           int_args++;
          // Allocate slots for the callee to stuff register args on the stack.
1161           stk_args += 2;
1162 #endif
1163         } else {
1164           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1165           stk_args += 2;
1166         }
1167         break;
1168       case T_VOID: // Halves of longs and doubles
1169         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1170         regs[i].set_bad();
1171         break;
1172       default:
1173         ShouldNotReachHere();
1174         break;
1175       }
1176     }
1177 #ifdef _WIN64
  // The Windows ABI requires that we always allocate enough stack space
  // for 4 64-bit registers to be stored down.
1180   if (stk_args < 8) {
1181     stk_args = 8;
1182   }
1183 #endif // _WIN64
1184 
1185   return stk_args;
1186 }
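// A hypothetical example of the mapping above: for native arguments
// (int, double), i.e. sig_bt = { T_INT, T_DOUBLE, T_VOID }, the System V path
// assigns c_rarg0 and c_farg0 and returns 0 stack slots, while the _WIN64 path
// assigns c_rarg0 and c_farg1 (integer and FP argument positions advance
// together), accumulates 4 shadow slots, and then returns the 8-slot minimum
// required for the register home area.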
1187 
1188 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1189                                              uint num_bits,
1190                                              uint total_args_passed) {
1191   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1192          "only certain vector sizes are supported for now");
1193 
1194   static const XMMRegister VEC_ArgReg[32] = {
1195      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1196      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1197     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1198     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1199   };
1200 
1201   uint stk_args = 0;
1202   uint fp_args = 0;
1203 
1204   for (uint i = 0; i < total_args_passed; i++) {
1205     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1206     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1207     regs[i].set_pair(vmreg->next(next_val), vmreg);
1208   }
1209 
1210   return stk_args;
1211 }
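// For example, with num_bits == 256 each argument i is handed the YMM view of
// xmm[i]: next_val is 7, so regs[i] spans vmreg .. vmreg->next(7), and no
// stack slots are used.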
1212 
1213 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the frame pointer,
  // which by this time is free to use.
1216   switch (ret_type) {
1217   case T_FLOAT:
1218     __ movflt(Address(rbp, -wordSize), xmm0);
1219     break;
1220   case T_DOUBLE:
1221     __ movdbl(Address(rbp, -wordSize), xmm0);
1222     break;
1223   case T_VOID:  break;
1224   default: {
1225     __ movptr(Address(rbp, -wordSize), rax);
1226     }
1227   }
1228 }
1229 
1230 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space just below the frame pointer,
  // which by this time is free to use.
1233   switch (ret_type) {
1234   case T_FLOAT:
1235     __ movflt(xmm0, Address(rbp, -wordSize));
1236     break;
1237   case T_DOUBLE:
1238     __ movdbl(xmm0, Address(rbp, -wordSize));
1239     break;
1240   case T_VOID:  break;
1241   default: {
1242     __ movptr(rax, Address(rbp, -wordSize));
1243     }
1244   }
1245 }
1246 
1247 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1248     for ( int i = first_arg ; i < arg_count ; i++ ) {
1249       if (args[i].first()->is_Register()) {
1250         __ push(args[i].first()->as_Register());
1251       } else if (args[i].first()->is_XMMRegister()) {
1252         __ subptr(rsp, 2*wordSize);
1253         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1254       }
1255     }
1256 }
1257 
1258 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1259     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1260       if (args[i].first()->is_Register()) {
1261         __ pop(args[i].first()->as_Register());
1262       } else if (args[i].first()->is_XMMRegister()) {
1263         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1264         __ addptr(rsp, 2*wordSize);
1265       }
1266     }
1267 }
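// restore_args walks the arguments in reverse so the pops mirror the pushes
// done by save_args above.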
1268 
1269 static void verify_oop_args(MacroAssembler* masm,
1270                             const methodHandle& method,
1271                             const BasicType* sig_bt,
1272                             const VMRegPair* regs) {
1273   Register temp_reg = rbx;  // not part of any compiled calling seq
1274   if (VerifyOops) {
1275     for (int i = 0; i < method->size_of_parameters(); i++) {
1276       if (is_reference_type(sig_bt[i])) {
1277         VMReg r = regs[i].first();
1278         assert(r->is_valid(), "bad oop arg");
1279         if (r->is_stack()) {
1280           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1281           __ verify_oop(temp_reg);
1282         } else {
1283           __ verify_oop(r->as_Register());
1284         }
1285       }
1286     }
1287   }
1288 }
1289 
1290 static void check_continuation_enter_argument(VMReg actual_vmreg,
1291                                               Register expected_reg,
1292                                               const char* name) {
1293   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1294   assert(actual_vmreg->as_Register() == expected_reg,
1295          "%s is in unexpected register: %s instead of %s",
1296          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1297 }
1298 
1299 
1300 //---------------------------- continuation_enter_setup ---------------------------
1301 //
1302 // Arguments:
1303 //   None.
1304 //
1305 // Results:
1306 //   rsp: pointer to blank ContinuationEntry
1307 //
1308 // Kills:
1309 //   rax
1310 //
1311 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1312   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1313   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1314   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1315 
1316   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1317   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1318 
1319   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1320   OopMap* map = new OopMap(frame_size, 0);
1321 
1322   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1323   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1324   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1325 
1326   return map;
1327 }
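     // After continuation_enter_setup, rsp points at a freshly reserved (and
     // still mostly blank) ContinuationEntry whose parent field links to the
     // previous entry, and JavaThread::_cont_entry has been updated to point
     // at it. fill_continuation_entry below populates the remaining fields.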
1328 
1329 //---------------------------- fill_continuation_entry ---------------------------
1330 //
1331 // Arguments:
1332 //   rsp: pointer to blank ContinuationEntry
1333 //   reg_cont_obj: pointer to the continuation
1334 //   reg_flags: flags
1335 //
1336 // Results:
1337 //   rsp: pointer to filled out ContinuationEntry
1338 //
1339 // Kills:
1340 //   rax
1341 //
1342 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1343   assert_different_registers(rax, reg_cont_obj, reg_flags);
1344 #ifdef ASSERT
1345   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1346 #endif
1347   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1348   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1349   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1350   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1351   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1352 
1353   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1354   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1355   __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
1356   __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
1357 
1358   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1359   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
1360 }
1361 
1362 //---------------------------- continuation_enter_cleanup ---------------------------
1363 //
1364 // Arguments:
1365 //   rsp: pointer to the ContinuationEntry
1366 //
1367 // Results:
1368 //   rsp: pointer to the spilled rbp in the entry frame
1369 //
1370 // Kills:
1371 //   rbx
1372 //
1373 static void continuation_enter_cleanup(MacroAssembler* masm) {
1374 #ifdef ASSERT
1375   Label L_good_sp;
1376   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1377   __ jcc(Assembler::equal, L_good_sp);
1378   __ stop("Incorrect rsp at continuation_enter_cleanup");
1379   __ bind(L_good_sp);
1380 #endif
1381   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1382   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1383 
1384   if (CheckJNICalls) {
1385     // Check if this is a virtual thread continuation
1386     Label L_skip_vthread_code;
1387     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1388     __ jcc(Assembler::equal, L_skip_vthread_code);
1389 
1390     // If the JNI monitor count is > 0 and this vthread is terminating then
1391     // it failed to release a JNI monitor. So we issue the same log message
1392     // that JavaThread::exit does.
1393     __ cmpptr(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1394     __ jcc(Assembler::equal, L_skip_vthread_code);
1395 
1396     // rax may hold an exception oop, save it before the call
1397     __ push(rax);
1398     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::log_jni_monitor_still_held));
1399     __ pop(rax);
1400 
1401     // For vthreads we have to explicitly zero the JNI monitor count of the carrier
1402     // on termination. The held count is implicitly zeroed below when we restore from
1403     // the parent held count (which has to be zero).
1404     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1405 
1406     __ bind(L_skip_vthread_code);
1407   }
1408 #ifdef ASSERT
1409   else {
1410     // Check if this is a virtual thread continuation
1411     Label L_skip_vthread_code;
1412     __ cmpl(Address(rsp, ContinuationEntry::flags_offset()), 0);
1413     __ jcc(Assembler::equal, L_skip_vthread_code);
1414 
1415     // See comment just above. If we are not checking JNI calls, the JNI monitor
1416     // count is only needed for assertion checking.
1417     __ movq(Address(r15_thread, JavaThread::jni_monitor_count_offset()), 0);
1418 
1419     __ bind(L_skip_vthread_code);
1420   }
1421 #endif
1422 
1423   __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
1424   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
1425 
1426   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1427   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1428   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1429 }
1430 
1431 static void gen_continuation_enter(MacroAssembler* masm,
1432                                    const VMRegPair* regs,
1433                                    int& exception_offset,
1434                                    OopMapSet* oop_maps,
1435                                    int& frame_complete,
1436                                    int& stack_slots,
1437                                    int& interpreted_entry_offset,
1438                                    int& compiled_entry_offset) {
1439 
1440   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1441   int pos_cont_obj   = 0;
1442   int pos_is_cont    = 1;
1443   int pos_is_virtual = 2;
1444 
1445   // The platform-specific calling convention may present the arguments in various registers.
1446   // To simplify the rest of the code, we expect the arguments to reside in these known
1447   // registers, and we additionally check the placement here in case the calling convention
1448   // ever changes.
1449   Register reg_cont_obj   = c_rarg1;
1450   Register reg_is_cont    = c_rarg2;
1451   Register reg_is_virtual = c_rarg3;
1452 
1453   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1454   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1455   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1456 
1457   // Utility methods kill rax, make sure there are no collisions
1458   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1459 
1460   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1461                          relocInfo::static_call_type);
1462 
1463   address start = __ pc();
1464 
1465   Label L_thaw, L_exit;
1466 
1467   // i2i entry used at interp_only_mode only
1468   interpreted_entry_offset = __ pc() - start;
1469   {
1470 #ifdef ASSERT
1471     Label is_interp_only;
1472     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1473     __ jcc(Assembler::notEqual, is_interp_only);
1474     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1475     __ bind(is_interp_only);
1476 #endif
1477 
1478     __ pop(rax); // return address
1479     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
1480     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1481     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1482     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
1483     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1484     __ push(rax); // return address
1485     __ push_cont_fastpath();
1486 
1487     __ enter();
1488 
1489     stack_slots = 2; // will be adjusted in setup
1490     OopMap* map = continuation_enter_setup(masm, stack_slots);
1491     // The frame is complete here, but we only record it for the compiled entry, so the frame
1492     // would appear unsafe. That is okay: at worst we miss an async sample, and we are in interp_only_mode anyway.
1493 
1494     __ verify_oop(reg_cont_obj);
1495 
1496     fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1497 
1498     // If continuation, call to thaw. Otherwise, resolve the call and exit.
1499     __ testptr(reg_is_cont, reg_is_cont);
1500     __ jcc(Assembler::notZero, L_thaw);
1501 
1502     // --- Resolve path
1503 
1504     // Make sure the call is patchable
1505     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1506     // Emit stub for static call
1507     address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1508     if (stub == nullptr) {
1509       fatal("CodeCache is full at gen_continuation_enter");
1510     }
1511     __ call(resolve);
1512     oop_maps->add_gc_map(__ pc() - start, map);
1513     __ post_call_nop();
1514 
1515     __ jmp(L_exit);
1516   }
1517 
1518   // compiled entry
1519   __ align(CodeEntryAlignment);
1520   compiled_entry_offset = __ pc() - start;
1521   __ enter();
1522 
1523   stack_slots = 2; // will be adjusted in setup
1524   OopMap* map = continuation_enter_setup(masm, stack_slots);
1525 
1526   // Frame is now completed as far as size and linkage.
1527   frame_complete = __ pc() - start;
1528 
1529   __ verify_oop(reg_cont_obj);
1530 
1531   fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1532 
1533   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1534   __ testptr(reg_is_cont, reg_is_cont);
1535   __ jccb(Assembler::notZero, L_thaw);
1536 
1537   // --- call Continuation.enter(Continuation c, boolean isContinue)
1538 
1539   // Make sure the call is patchable
1540   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1541 
1542   // Emit stub for static call
1543   address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1544   if (stub == nullptr) {
1545     fatal("CodeCache is full at gen_continuation_enter");
1546   }
1547 
1548   // The call needs to be resolved. There's a special case for this in
1549   // SharedRuntime::find_callee_info_helper() which calls
1550   // LinkResolver::resolve_continuation_enter() which resolves the call to
1551   // Continuation.enter(Continuation c, boolean isContinue).
1552   __ call(resolve);
1553 
1554   oop_maps->add_gc_map(__ pc() - start, map);
1555   __ post_call_nop();
1556 
1557   __ jmpb(L_exit);
1558 
1559   // --- Thawing path
1560 
1561   __ bind(L_thaw);
1562 
1563   ContinuationEntry::_thaw_call_pc_offset = __ pc() - start;
1564   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1565 
1566   ContinuationEntry::_return_pc_offset = __ pc() - start;
1567   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1568   __ post_call_nop();
1569 
1570   // --- Normal exit (resolve/thawing)
1571 
1572   __ bind(L_exit);
1573   ContinuationEntry::_cleanup_offset = __ pc() - start;
1574   continuation_enter_cleanup(masm);
1575   __ pop(rbp);
1576   __ ret(0);
1577 
1578   // --- Exception handling path
1579 
1580   exception_offset = __ pc() - start;
1581 
1582   continuation_enter_cleanup(masm);
1583   __ pop(rbp);
1584 
1585   __ movptr(c_rarg0, r15_thread);
1586   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1587 
1588   // rax still holds the original exception oop, save it before the call
1589   __ push(rax);
1590 
1591   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1592   __ movptr(rbx, rax);
1593 
1594   // Continue at exception handler:
1595   //   rax: exception oop
1596   //   rbx: exception handler
1597   //   rdx: exception pc
1598   __ pop(rax);
1599   __ verify_oop(rax);
1600   __ pop(rdx);
1601   __ jmp(rbx);
1602 }
1603 
1604 static void gen_continuation_yield(MacroAssembler* masm,
1605                                    const VMRegPair* regs,
1606                                    OopMapSet* oop_maps,
1607                                    int& frame_complete,
1608                                    int& stack_slots,
1609                                    int& compiled_entry_offset) {
1610   enum layout {
1611     rbp_off,
1612     rbpH_off,
1613     return_off,
1614     return_off2,
1615     framesize // inclusive of return address
1616   };
1617   stack_slots = framesize /  VMRegImpl::slots_per_word;
1618   assert(stack_slots == 2, "recheck layout");
1619 
1620   address start = __ pc();
1621   compiled_entry_offset = __ pc() - start;
1622   __ enter();
1623   address the_pc = __ pc();
1624 
1625   frame_complete = the_pc - start;
1626 
1627   // This nop must be exactly at the PC we push into the frame info.
1628   // We use this nop for fast CodeBlob lookup, so we associate the OopMap
1629   // with it right away.
1630   __ post_call_nop();
1631   OopMap* map = new OopMap(framesize, 1);
1632   oop_maps->add_gc_map(frame_complete, map);
1633 
1634   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1635   __ movptr(c_rarg0, r15_thread);
1636   __ movptr(c_rarg1, rsp);
1637   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1638   __ reset_last_Java_frame(true);
1639 
1640   Label L_pinned;
1641 
1642   __ testptr(rax, rax);
1643   __ jcc(Assembler::notZero, L_pinned);
1644 
1645   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1646   continuation_enter_cleanup(masm);
1647   __ pop(rbp);
1648   __ ret(0);
1649 
1650   __ bind(L_pinned);
1651 
1652   // Pinned, return to caller
1653 
1654   // handle pending exception thrown by freeze
1655   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1656   Label ok;
1657   __ jcc(Assembler::equal, ok);
1658   __ leave();
1659   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1660   __ bind(ok);
1661 
1662   __ leave();
1663   __ ret(0);
1664 }
1665 
1666 void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) {
1667   ::continuation_enter_cleanup(masm);
1668 }
1669 
1670 static void gen_special_dispatch(MacroAssembler* masm,
1671                                  const methodHandle& method,
1672                                  const BasicType* sig_bt,
1673                                  const VMRegPair* regs) {
1674   verify_oop_args(masm, method, sig_bt, regs);
1675   vmIntrinsics::ID iid = method->intrinsic_id();
1676 
1677   // Now write the args into the outgoing interpreter space
1678   bool     has_receiver   = false;
1679   Register receiver_reg   = noreg;
1680   int      member_arg_pos = -1;
1681   Register member_reg     = noreg;
1682   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1683   if (ref_kind != 0) {
1684     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1685     member_reg = rbx;  // known to be free at this point
1686     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1687   } else if (iid == vmIntrinsics::_invokeBasic) {
1688     has_receiver = true;
1689   } else if (iid == vmIntrinsics::_linkToNative) {
1690     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1691     member_reg = rbx;  // known to be free at this point
1692   } else {
1693     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1694   }
1695 
1696   if (member_reg != noreg) {
1697     // Load the member_arg into register, if necessary.
1698     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1699     VMReg r = regs[member_arg_pos].first();
1700     if (r->is_stack()) {
1701       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1702     } else {
1703       // no data motion is needed
1704       member_reg = r->as_Register();
1705     }
1706   }
1707 
1708   if (has_receiver) {
1709     // Make sure the receiver is loaded into a register.
1710     assert(method->size_of_parameters() > 0, "oob");
1711     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1712     VMReg r = regs[0].first();
1713     assert(r->is_valid(), "bad receiver arg");
1714     if (r->is_stack()) {
1715       // Porting note:  This assumes that compiled calling conventions always
1716       // pass the receiver oop in a register.  If this is not true on some
1717       // platform, pick a temp and load the receiver from stack.
1718       fatal("receiver always in a register");
1719       receiver_reg = j_rarg0;  // known to be free at this point
1720       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1721     } else {
1722       // no data motion is needed
1723       receiver_reg = r->as_Register();
1724     }
1725   }
1726 
1727   // Figure out which address we are really jumping to:
1728   MethodHandles::generate_method_handle_dispatch(masm, iid,
1729                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1730 }
1731 
1732 // ---------------------------------------------------------------------------
1733 // Generate a native wrapper for a given method.  The method takes arguments
1734 // in the Java compiled code convention, marshals them to the native
1735 // convention (handlizes oops, etc), transitions to native, makes the call,
1736 // returns to java state (possibly blocking), unhandlizes any result and
1737 // returns.
1738 //
1739 // Critical native functions are a shorthand for the use of
1740 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1741 // functions.  The wrapper is expected to unpack the arguments before
1742 // passing them to the callee. Critical native functions leave the state _in_Java,
1743 // since they cannot stop for GC.
1744 // Some other parts of JNI setup are skipped, like the tear-down of the JNI handle
1745 // block and the check for pending exceptions, since it is impossible for them
1746 // to be thrown.
1747 //
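     // In outline, the (non-intrinsic) wrapper generated below:
     //   1. performs the inline cache check and, if needed, a class-init barrier,
     //   2. bangs the stack, builds the frame and runs the nmethod entry barrier,
     //   3. shuffles the Java args into the C convention, handlizing oops (and
     //      the class mirror for static methods),
     //   4. acquires the monitor for synchronized methods,
     //   5. switches to _thread_in_native and calls the native function,
     //   6. transitions back via _thread_in_native_trans with a safepoint check,
     //      re-guards the stack if needed, releases the monitor and unhandlizes
     //      any oop result,
     //   7. returns, or forwards a pending exception.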
1748 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1749                                                 const methodHandle& method,
1750                                                 int compile_id,
1751                                                 BasicType* in_sig_bt,
1752                                                 VMRegPair* in_regs,
1753                                                 BasicType ret_type) {
1754   if (method->is_continuation_native_intrinsic()) {
1755     int exception_offset = -1;
1756     OopMapSet* oop_maps = new OopMapSet();
1757     int frame_complete = -1;
1758     int stack_slots = -1;
1759     int interpreted_entry_offset = -1;
1760     int vep_offset = -1;
1761     if (method->is_continuation_enter_intrinsic()) {
1762       gen_continuation_enter(masm,
1763                              in_regs,
1764                              exception_offset,
1765                              oop_maps,
1766                              frame_complete,
1767                              stack_slots,
1768                              interpreted_entry_offset,
1769                              vep_offset);
1770     } else if (method->is_continuation_yield_intrinsic()) {
1771       gen_continuation_yield(masm,
1772                              in_regs,
1773                              oop_maps,
1774                              frame_complete,
1775                              stack_slots,
1776                              vep_offset);
1777     } else {
1778       guarantee(false, "Unknown Continuation native intrinsic");
1779     }
1780 
1781 #ifdef ASSERT
1782     if (method->is_continuation_enter_intrinsic()) {
1783       assert(interpreted_entry_offset != -1, "Must be set");
1784       assert(exception_offset != -1,         "Must be set");
1785     } else {
1786       assert(interpreted_entry_offset == -1, "Must be unset");
1787       assert(exception_offset == -1,         "Must be unset");
1788     }
1789     assert(frame_complete != -1,    "Must be set");
1790     assert(stack_slots != -1,       "Must be set");
1791     assert(vep_offset != -1,        "Must be set");
1792 #endif
1793 
1794     __ flush();
1795     nmethod* nm = nmethod::new_native_nmethod(method,
1796                                               compile_id,
1797                                               masm->code(),
1798                                               vep_offset,
1799                                               frame_complete,
1800                                               stack_slots,
1801                                               in_ByteSize(-1),
1802                                               in_ByteSize(-1),
1803                                               oop_maps,
1804                                               exception_offset);
1805     if (nm == nullptr) return nm;
1806     if (method->is_continuation_enter_intrinsic()) {
1807       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
1808     } else if (method->is_continuation_yield_intrinsic()) {
1809       ContinuationEntry::set_yield_code(nm);
1810     }
1811     return nm;
1812   }
1813 
1814   if (method->is_method_handle_intrinsic()) {
1815     vmIntrinsics::ID iid = method->intrinsic_id();
1816     intptr_t start = (intptr_t)__ pc();
1817     int vep_offset = ((intptr_t)__ pc()) - start;
1818     gen_special_dispatch(masm,
1819                          method,
1820                          in_sig_bt,
1821                          in_regs);
1822     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1823     __ flush();
1824     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1825     return nmethod::new_native_nmethod(method,
1826                                        compile_id,
1827                                        masm->code(),
1828                                        vep_offset,
1829                                        frame_complete,
1830                                        stack_slots / VMRegImpl::slots_per_word,
1831                                        in_ByteSize(-1),
1832                                        in_ByteSize(-1),
1833                                        nullptr);
1834   }
1835   address native_func = method->native_function();
1836   assert(native_func != nullptr, "must have function");
1837 
1838   // An OopMap for lock (and class if static)
1839   OopMapSet *oop_maps = new OopMapSet();
1840   intptr_t start = (intptr_t)__ pc();
1841 
1842   // We have received a description of where all the Java args are located
1843   // on entry to the wrapper. We need to convert these args to where
1844   // the jni function will expect them. To figure out where they go
1845   // we convert the java signature to a C signature by inserting
1846   // the hidden arguments as arg[0] and possibly arg[1] (static method)
1847 
1848   const int total_in_args = method->size_of_parameters();
1849   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1850 
1851   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1852   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1853 
1854   int argc = 0;
1855   out_sig_bt[argc++] = T_ADDRESS;
1856   if (method->is_static()) {
1857     out_sig_bt[argc++] = T_OBJECT;
1858   }
1859 
1860   for (int i = 0; i < total_in_args ; i++ ) {
1861     out_sig_bt[argc++] = in_sig_bt[i];
1862   }
1863 
1864   // Now figure out where the args must be stored and how much stack space
1865   // they require.
1866   int out_arg_slots;
1867   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
1868 
1869   // Compute framesize for the wrapper.  We need to handlize all oops in
1870   // incoming registers
1871 
1872   // Calculate the total number of stack slots we will need.
1873 
1874   // First count the abi requirement plus all of the outgoing args
1875   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1876 
1877   // Now the space for the inbound oop handle area
1878   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1879 
1880   int oop_handle_offset = stack_slots;
1881   stack_slots += total_save_slots;
1882 
1883   // Now any space we need for handlizing a klass if static method
1884 
1885   int klass_slot_offset = 0;
1886   int klass_offset = -1;
1887   int lock_slot_offset = 0;
1888   bool is_static = false;
1889 
1890   if (method->is_static()) {
1891     klass_slot_offset = stack_slots;
1892     stack_slots += VMRegImpl::slots_per_word;
1893     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1894     is_static = true;
1895   }
1896 
1897   // Plus a lock if needed
1898 
1899   if (method->is_synchronized()) {
1900     lock_slot_offset = stack_slots;
1901     stack_slots += VMRegImpl::slots_per_word;
1902   }
1903 
1904   // Now a place (+2) to save return values or temp during shuffling
1905   // + 4 for return address (which we own) and saved rbp
1906   stack_slots += 6;
1907 
1908   // Ok The space we have allocated will look like:
1909   //
1910   //
1911   // FP-> |                     |
1912   //      |---------------------|
1913   //      | 2 slots for moves   |
1914   //      |---------------------|
1915   //      | lock box (if sync)  |
1916   //      |---------------------| <- lock_slot_offset
1917   //      | klass (if static)   |
1918   //      |---------------------| <- klass_slot_offset
1919   //      | oopHandle area      |
1920   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1921   //      | outbound memory     |
1922   //      | based arguments     |
1923   //      |                     |
1924   //      |---------------------|
1925   //      |                     |
1926   // SP-> | out_preserved_slots |
1927   //
1928   //
1929 
1930 
1931   // Now compute the actual number of stack words we need, rounding to keep
1932   // the stack properly aligned.
1933   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1934 
1935   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
1936 
1937   // First thing make an ic check to see if we should even be here
1938 
1939   // We are free to use all registers as temps without saving them and
1940   // restoring them except rbp. rbp is the only callee save register
1941   // as far as the interpreter and the compiler(s) are concerned.
1942 
1943   const Register receiver = j_rarg0;
1944 
1945   Label exception_pending;
1946 
1947   assert_different_registers(receiver, rscratch1, rscratch2);
1948   __ verify_oop(receiver);
1949   __ ic_check(8 /* end_alignment */);
1950 
1951   int vep_offset = ((intptr_t)__ pc()) - start;
1952 
1953   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1954     Label L_skip_barrier;
1955     Register klass = r10;
1956     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1957     __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
1958 
1959     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1960 
1961     __ bind(L_skip_barrier);
1962   }
1963 
1964 #ifdef COMPILER1
1965   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
1966   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1967     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1968   }
1969 #endif // COMPILER1
1970 
1971   // The instruction at the verified entry point must be 5 bytes or longer
1972   // because it can be patched on the fly by make_non_entrant. The stack bang
1973   // instruction fits that requirement.
1974 
1975   // Generate stack overflow check
1976   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1977 
1978   // Generate a new frame for the wrapper.
1979   __ enter();
1980   // -2 because return address is already present and so is saved rbp
1981   __ subptr(rsp, stack_size - 2*wordSize);
1982 
1983   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1984   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
1985   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
1986 
1987   // Frame is now completed as far as size and linkage.
1988   int frame_complete = ((intptr_t)__ pc()) - start;
1989 
1990 #ifdef ASSERT
1991   __ check_stack_alignment(rsp, "improperly aligned stack");
1992 #endif /* ASSERT */
1993 
1994 
1995   // We use r14 as the oop handle for the receiver/klass
1996   // It is callee save so it survives the call to native
1997 
1998   const Register oop_handle_reg = r14;
1999 
2000   //
2001   // We immediately shuffle the arguments so that for any vm call we have to
2002   // make from here on out (sync slow path, jvmti, etc.) we will have
2003   // captured the oops from our caller and have a valid oopMap for
2004   // them.
2005 
2006   // -----------------
2007   // The Grand Shuffle
2008 
2009   // The Java calling convention is either equal to (linux) or denser than (win64) the
2010   // c calling convention. However, because of the jni_env argument the c calling
2011   // convention always has at least one more (and two for static) arguments than Java.
2012   // Therefore if we move the args from java -> c backwards then we will never have
2013   // a register->register conflict and we don't have to build a dependency graph
2014   // and figure out how to break any cycles.
2015   //
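       // Illustrative example (Linux-x86_64): j_rargN aliases c_rarg(N+1), so once
       // JNIEnv* (and, for static methods, the class mirror) is prepended, each
       // incoming integer register argument maps to the same or the next c_rarg
       // register, and xmm arguments stay put. Walking the arguments from last to
       // first therefore reads every source register before it can be overwritten
       // as a destination.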
2016 
2017   // Record esp-based slot for receiver on stack for non-static methods
2018   int receiver_offset = -1;
2019 
2020   // This is a trick. We double the stack slots so we can claim
2021   // the oops in the caller's frame. Since we are sure to have
2022   // more args than the caller, doubling is enough to make
2023   // sure we can capture all the incoming oop args from the
2024   // caller.
2025   //
2026   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2027 
2028   // Mark location of rbp (someday)
2029   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2030 
2031   // Use eax, ebx as temporaries during any memory-memory moves we have to do
2032   // All inbound args are referenced based on rbp and all outbound args via rsp.
2033 
2034 
2035 #ifdef ASSERT
2036   bool reg_destroyed[Register::number_of_registers];
2037   bool freg_destroyed[XMMRegister::number_of_registers];
2038   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2039     reg_destroyed[r] = false;
2040   }
2041   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2042     freg_destroyed[f] = false;
2043   }
2044 
2045 #endif /* ASSERT */
2046 
2047   // For JNI natives the incoming and outgoing registers are offset upwards.
2048   GrowableArray<int> arg_order(2 * total_in_args);
2049 
2050   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2051     arg_order.push(i);
2052     arg_order.push(c_arg);
2053   }
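       // arg_order now holds (java_index, c_index) pairs, last argument first, so
       // the move loop below performs the backward shuffle described above.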
2054 
2055   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2056     int i = arg_order.at(ai);
2057     int c_arg = arg_order.at(ai + 1);
2058     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2059 #ifdef ASSERT
2060     if (in_regs[i].first()->is_Register()) {
2061       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2062     } else if (in_regs[i].first()->is_XMMRegister()) {
2063       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2064     }
2065     if (out_regs[c_arg].first()->is_Register()) {
2066       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2067     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2068       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2069     }
2070 #endif /* ASSERT */
2071     switch (in_sig_bt[i]) {
2072       case T_ARRAY:
2073       case T_OBJECT:
2074         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2075                     ((i == 0) && (!is_static)),
2076                     &receiver_offset);
2077         break;
2078       case T_VOID:
2079         break;
2080 
2081       case T_FLOAT:
2082         __ float_move(in_regs[i], out_regs[c_arg]);
2083           break;
2084 
2085       case T_DOUBLE:
2086         assert( i + 1 < total_in_args &&
2087                 in_sig_bt[i + 1] == T_VOID &&
2088                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2089         __ double_move(in_regs[i], out_regs[c_arg]);
2090         break;
2091 
2092       case T_LONG :
2093         __ long_move(in_regs[i], out_regs[c_arg]);
2094         break;
2095 
2096       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2097 
2098       default:
2099         __ move32_64(in_regs[i], out_regs[c_arg]);
2100     }
2101   }
2102 
2103   int c_arg;
2104 
2105   // Pre-load a static method's oop into r14.  Used both by locking code and
2106   // the normal JNI call code.
2107   // Point c_arg at the first arg that is already loaded in case we
2108   // need to spill before we call out.
2109   c_arg = total_c_args - total_in_args;
2110 
2111   if (method->is_static()) {
2112 
2113     //  load oop into a register
2114     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2115 
2116     // Now handlize the static class mirror; it's known not-null.
2117     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2118     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2119 
2120     // Now get the handle
2121     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2122     // store the klass handle as second argument
2123     __ movptr(c_rarg1, oop_handle_reg);
2124     // and protect the arg if we must spill
2125     c_arg--;
2126   }
2127 
2128   // Change state to native (we save the return address in the thread, since it might not
2129   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2130   // points into the right code segment. It does not have to be the correct return pc.
2131   // We use the same pc/oopMap repeatedly when we call out
2132 
2133   Label native_return;
2134   if (method->is_object_wait0()) {
2135     // For convenience we use the pc we want to resume to in case of preemption on Object.wait.
2136     __ set_last_Java_frame(rsp, noreg, native_return, rscratch1);
2137   } else {
2138     intptr_t the_pc = (intptr_t) __ pc();
2139     oop_maps->add_gc_map(the_pc - start, map);
2140 
2141     __ set_last_Java_frame(rsp, noreg, __ pc(), rscratch1);
2142   }
2143 
2144   // We have all of the arguments set up at this point. We must not touch any of the argument
2145   // registers from here on (if we had to save/restore them, there would be no oopMap describing them).
2146 
2147   if (DTraceMethodProbes) {
2148     // protect the args we've loaded
2149     save_args(masm, total_c_args, c_arg, out_regs);
2150     __ mov_metadata(c_rarg1, method());
2151     __ call_VM_leaf(
2152       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2153       r15_thread, c_rarg1);
2154     restore_args(masm, total_c_args, c_arg, out_regs);
2155   }
2156 
2157   // RedefineClasses() tracing support for obsolete method entry
2158   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2159     // protect the args we've loaded
2160     save_args(masm, total_c_args, c_arg, out_regs);
2161     __ mov_metadata(c_rarg1, method());
2162     __ call_VM_leaf(
2163       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2164       r15_thread, c_rarg1);
2165     restore_args(masm, total_c_args, c_arg, out_regs);
2166   }
2167 
2168   // Lock a synchronized method
2169 
2170   // Register definitions used by locking and unlocking
2171 
2172   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2173   const Register obj_reg  = rbx;  // Will contain the oop
2174   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2175 
2176   Label slow_path_lock;
2177   Label lock_done;
2178 
2179   if (method->is_synchronized()) {
2180     // Get the handle (the 2nd argument)
2181     __ mov(oop_handle_reg, c_rarg1);
2182 
2183     // Get address of the box
2184 
2185     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2186 
2187     // Load the oop from the handle
2188     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2189 
2190     __ lightweight_lock(lock_reg, obj_reg, swap_reg, rscratch1, slow_path_lock);
2191 
2192     // Slow path will re-enter here
2193     __ bind(lock_done);
2194   }
2195 
2196   // Finally just about ready to make the JNI call
2197 
2198   // get JNIEnv* which is first argument to native
2199   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2200 
2201   // Now set thread in native
2202   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2203 
2204   __ call(RuntimeAddress(native_func));
2205 
2206   // Verify or restore cpu control state after JNI call
2207   __ restore_cpu_control_state_after_jni(rscratch1);
2208 
2209   // Unpack native results.
2210   switch (ret_type) {
2211   case T_BOOLEAN: __ c2bool(rax);            break;
2212   case T_CHAR   : __ movzwl(rax, rax);      break;
2213   case T_BYTE   : __ sign_extend_byte (rax); break;
2214   case T_SHORT  : __ sign_extend_short(rax); break;
2215   case T_INT    : /* nothing to do */        break;
2216   case T_DOUBLE :
2217   case T_FLOAT  :
2218     // Result is in xmm0 we'll save as needed
2219     break;
2220   case T_ARRAY:                 // Really a handle
2221   case T_OBJECT:                // Really a handle
2222       break; // can't de-handlize until after safepoint check
2223   case T_VOID: break;
2224   case T_LONG: break;
2225   default       : ShouldNotReachHere();
2226   }
2227 
2228   // Switch thread to "native transition" state before reading the synchronization state.
2229   // This additional state is necessary because reading and testing the synchronization
2230   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2231   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2232   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2233   //     Thread A is resumed to finish this native method, but doesn't block here since it
2234   //     didn't see any synchronization in progress, and escapes.
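       // Schematically, the thread-state sequence around the native call is:
       //   _thread_in_Java -> _thread_in_native -> (native call) ->
       //   _thread_in_native_trans -> [safepoint/suspend poll] -> _thread_in_Java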
2235   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2236 
2237   // Force this write out before the read below
2238   if (!UseSystemMemoryBarrier) {
2239     __ membar(Assembler::Membar_mask_bits(
2240               Assembler::LoadLoad | Assembler::LoadStore |
2241               Assembler::StoreLoad | Assembler::StoreStore));
2242   }
2243 
2244   // check for safepoint operation in progress and/or pending suspend requests
2245   {
2246     Label Continue;
2247     Label slow_path;
2248 
2249     __ safepoint_poll(slow_path, true /* at_return */, false /* in_nmethod */);
2250 
2251     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2252     __ jcc(Assembler::equal, Continue);
2253     __ bind(slow_path);
2254 
2255     // Don't use call_VM as it will see a possible pending exception and forward it
2256     // and never return here, preventing us from clearing _last_native_pc down below.
2257     // Nor can we use call_VM_leaf, as it will check to see if rsi & rdi are
2258     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2259     // by hand.
2260     //
2261     __ vzeroupper();
2262     save_native_result(masm, ret_type, stack_slots);
2263     __ mov(c_rarg0, r15_thread);
2264     __ mov(r12, rsp); // remember sp
2265     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2266     __ andptr(rsp, -16); // align stack as required by ABI
2267     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2268     __ mov(rsp, r12); // restore sp
2269     __ reinit_heapbase();
2270     // Restore any method result value
2271     restore_native_result(masm, ret_type, stack_slots);
2272     __ bind(Continue);
2273   }
2274 
2275   // change thread state
2276   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2277 
2278   if (method->is_object_wait0()) {
2279     // Check preemption for Object.wait()
2280     __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset()));
2281     __ cmpptr(rscratch1, NULL_WORD);
2282     __ jccb(Assembler::equal, native_return);
2283     __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD);
2284     __ jmp(rscratch1);
2285     __ bind(native_return);
2286 
2287     intptr_t the_pc = (intptr_t) __ pc();
2288     oop_maps->add_gc_map(the_pc - start, map);
2289   }
2290 
2291 
2292   Label reguard;
2293   Label reguard_done;
2294   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2295   __ jcc(Assembler::equal, reguard);
2296   __ bind(reguard_done);
2297 
2298   // native result if any is live
2299 
2300   // Unlock
2301   Label slow_path_unlock;
2302   Label unlock_done;
2303   if (method->is_synchronized()) {
2304 
2305     Label fast_done;
2306 
2307     // Get locked oop from the handle we passed to jni
2308     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2309 
2310     // Must save rax if it is live now because cmpxchg must use it
2311     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2312       save_native_result(masm, ret_type, stack_slots);
2313     }
2314 
2315     __ lightweight_unlock(obj_reg, swap_reg, lock_reg, slow_path_unlock);
2316 
2317     // slow path re-enters here
2318     __ bind(unlock_done);
2319     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2320       restore_native_result(masm, ret_type, stack_slots);
2321     }
2322 
2323     __ bind(fast_done);
2324   }
2325   if (DTraceMethodProbes) {
2326     save_native_result(masm, ret_type, stack_slots);
2327     __ mov_metadata(c_rarg1, method());
2328     __ call_VM_leaf(
2329          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2330          r15_thread, c_rarg1);
2331     restore_native_result(masm, ret_type, stack_slots);
2332   }
2333 
2334   __ reset_last_Java_frame(false);
2335 
2336   // Unbox oop result, e.g. JNIHandles::resolve value.
2337   if (is_reference_type(ret_type)) {
2338     __ resolve_jobject(rax /* value */,
2339                        rcx /* tmp */);
2340   }
2341 
2342   if (CheckJNICalls) {
2343     // clear_pending_jni_exception_check
2344     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2345   }
2346 
2347   // reset handle block
2348   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2349   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2350 
2351   // pop our frame
2352 
2353   __ leave();
2354 
2355 #if INCLUDE_JFR
2356   // We need to do a poll test after unwind in case the sampler
2357   // managed to sample the native frame after returning to Java.
2358   Label L_return;
2359   address poll_test_pc = __ pc();
2360   __ relocate(relocInfo::poll_return_type);
2361   __ testb(Address(r15_thread, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
2362   __ jccb(Assembler::zero, L_return);
2363   __ lea(rscratch1, InternalAddress(poll_test_pc));
2364   __ movptr(Address(r15_thread, JavaThread::saved_exception_pc_offset()), rscratch1);
2365   assert(SharedRuntime::polling_page_return_handler_blob() != nullptr,
2366     "polling page return stub not created yet");
2367   address stub = SharedRuntime::polling_page_return_handler_blob()->entry_point();
2368   __ jump(RuntimeAddress(stub));
2369   __ bind(L_return);
2370 #endif // INCLUDE_JFR
2371 
2372   // Any exception pending?
2373   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2374   __ jcc(Assembler::notEqual, exception_pending);
2375 
2376   // Return
2377 
2378   __ ret(0);
2379 
2380   // Unexpected paths are out of line and go here
2381 
2382   // forward the exception
2383   __ bind(exception_pending);
2384 
2385   // and forward the exception
2386   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2387 
2388   // Slow path locking & unlocking
2389   if (method->is_synchronized()) {
2390 
2391     // BEGIN Slow path lock
2392     __ bind(slow_path_lock);
2393 
2394     // has last_Java_frame set up. No exceptions, so do a vanilla call, not call_VM
2395     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2396 
2397     // protect the args we've loaded
2398     save_args(masm, total_c_args, c_arg, out_regs);
2399 
2400     __ mov(c_rarg0, obj_reg);
2401     __ mov(c_rarg1, lock_reg);
2402     __ mov(c_rarg2, r15_thread);
2403 
2404     // Not a leaf but we have last_Java_frame setup as we want.
2405     // We don't want to unmount in case of contention since that would complicate preserving
2406     // the arguments that had already been marshalled into the native convention. So we force
2407     // the freeze slow path to find this native wrapper frame (see recurse_freeze_native_frame())
2408     // and pin the vthread. Otherwise the fast path won't find it since we don't walk the stack.
2409     __ push_cont_fastpath();
2410     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2411     __ pop_cont_fastpath();
2412     restore_args(masm, total_c_args, c_arg, out_regs);
2413 
2414 #ifdef ASSERT
2415     { Label L;
2416     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2417     __ jcc(Assembler::equal, L);
2418     __ stop("no pending exception allowed on exit from monitorenter");
2419     __ bind(L);
2420     }
2421 #endif
2422     __ jmp(lock_done);
2423 
2424     // END Slow path lock
2425 
2426     // BEGIN Slow path unlock
2427     __ bind(slow_path_unlock);
2428 
2429     // If we haven't already saved the native result we must save it now as xmm registers
2430     // are still exposed.
2431     __ vzeroupper();
2432     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2433       save_native_result(masm, ret_type, stack_slots);
2434     }
2435 
2436     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2437 
2438     __ mov(c_rarg0, obj_reg);
2439     __ mov(c_rarg2, r15_thread);
2440     __ mov(r12, rsp); // remember sp
2441     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2442     __ andptr(rsp, -16); // align stack as required by ABI
2443 
2444     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2445     // NOTE that obj_reg == rbx currently
2446     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2447     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2448 
2449     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2450     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2451     __ mov(rsp, r12); // restore sp
2452     __ reinit_heapbase();
2453 #ifdef ASSERT
2454     {
2455       Label L;
2456       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2457       __ jcc(Assembler::equal, L);
2458       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2459       __ bind(L);
2460     }
2461 #endif /* ASSERT */
2462 
2463     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2464 
2465     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2466       restore_native_result(masm, ret_type, stack_slots);
2467     }
2468     __ jmp(unlock_done);
2469 
2470     // END Slow path unlock
2471 
2472   } // synchronized
2473 
2474   // SLOW PATH Reguard the stack if needed
2475 
2476   __ bind(reguard);
2477   __ vzeroupper();
2478   save_native_result(masm, ret_type, stack_slots);
2479   __ mov(r12, rsp); // remember sp
2480   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2481   __ andptr(rsp, -16); // align stack as required by ABI
2482   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2483   __ mov(rsp, r12); // restore sp
2484   __ reinit_heapbase();
2485   restore_native_result(masm, ret_type, stack_slots);
2486   // and continue
2487   __ jmp(reguard_done);
2488 
2489 
2490 
2491   __ flush();
2492 
2493   nmethod *nm = nmethod::new_native_nmethod(method,
2494                                             compile_id,
2495                                             masm->code(),
2496                                             vep_offset,
2497                                             frame_complete,
2498                                             stack_slots / VMRegImpl::slots_per_word,
2499                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2500                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2501                                             oop_maps);
2502 
2503   return nm;
2504 }
2505 
2506 // This function returns the adjustment size (in number of words) to a c2i adapter
2507 // activation for use during deoptimization.
2508 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2509   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2510 }
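     // For example, with 2 callee parameters and 5 callee locals the adjustment is
     // (5 - 2) * Interpreter::stackElementWords words.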
2511 
2512 
2513 uint SharedRuntime::out_preserve_stack_slots() {
2514   return 0;
2515 }
2516 
2517 
2518 // Number of stack slots between incoming argument block and the start of
2519 // a new frame.  The PROLOG must add this many slots to the stack.  The
2520 // EPILOG must remove this many slots.  amd64 needs two slots for
2521 // return address.
2522 uint SharedRuntime::in_preserve_stack_slots() {
2523   return 4 + 2 * VerifyStackAtCalls;
2524 }
2525 
2526 VMReg SharedRuntime::thread_register() {
2527   return r15_thread->as_VMReg();
2528 }
2529 
2530 //------------------------------generate_deopt_blob----------------------------
2531 void SharedRuntime::generate_deopt_blob() {
2532   // Allocate space for the code
2533   ResourceMark rm;
2534   // Setup code generation tools
2535   int pad = 0;
2536   if (UseAVX > 2) {
2537     pad += 1024;
2538   }
2539   if (UseAPX) {
2540     pad += 1024;
2541   }
2542 #if INCLUDE_JVMCI
2543   if (EnableJVMCI) {
2544     pad += 512; // Increase the buffer size when compiling for JVMCI
2545   }
2546 #endif
2547   const char* name = SharedRuntime::stub_name(StubId::shared_deopt_id);
2548   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
2549   if (blob != nullptr) {
2550     _deopt_blob = blob->as_deoptimization_blob();
2551     return;
2552   }
2553 
2554   CodeBuffer buffer(name, 2560+pad, 1024);
2555   MacroAssembler* masm = new MacroAssembler(&buffer);
2556   int frame_size_in_words;
2557   OopMap* map = nullptr;
2558   OopMapSet *oop_maps = new OopMapSet();
2559 
2560   // -------------
2561   // This code enters when returning to a de-optimized nmethod.  A return
2562   // address has been pushed on the stack, and return values are in
2563   // registers.
2564   // If we are doing a normal deopt then we were called from the patched
2565   // nmethod from the point we returned to the nmethod. So the return
2566   // address on the stack is wrong by NativeCall::instruction_size
2567   // We will adjust the value so it looks like we have the original return
2568   // address on the stack (like when we eagerly deoptimized).
2569   // In the case of an exception pending when deoptimizing, we enter
2570   // with a return address on the stack that points after the call we patched
2571   // into the exception handler. We have the following register state from,
2572   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2573   //    rax: exception oop
2574   //    rbx: exception handler
2575   //    rdx: throwing pc
2576   // So in this case we simply jam rdx into the useless return address and
2577   // the stack looks just like we want.
2578   //
2579   // At this point we need to de-opt.  We save the argument return
2580   // registers.  We call the first C routine, fetch_unroll_info().  This
2581   // routine captures the return values and returns a structure which
2582   // describes the current frame size and the sizes of all replacement frames.
2583   // The current frame is compiled code and may contain many inlined
2584   // functions, each with their own JVM state.  We pop the current frame, then
2585   // push all the new frames.  Then we call the C routine unpack_frames() to
2586   // populate these frames.  Finally unpack_frames() returns us the new target
2587   // address.  Notice that callee-save registers are BLOWN here; they have
2588   // already been captured in the vframeArray at the time the return PC was
2589   // patched.
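       // Schematically, one compiled frame (possibly covering several inlined JVM
       // states) is replaced by a stack of interpreter frames, one per JVM state,
       // before execution resumes at the address returned by unpack_frames().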
2590   address start = __ pc();
2591   Label cont;
2592 
2593   // Prolog for non exception case!
2594 
2595   // Save everything in sight.
2596   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2597 
2598   // Normal deoptimization.  Save exec mode for unpack_frames.
2599   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2600   __ jmp(cont);
2601 
2602   int reexecute_offset = __ pc() - start;
2603 #if INCLUDE_JVMCI && !defined(COMPILER1)
2604   if (UseJVMCICompiler) {
2605     // JVMCI does not use this kind of deoptimization
2606     __ should_not_reach_here();
2607   }
2608 #endif
2609 
2610   // Reexecute case
2611   // The return address is the pc that describes which bci to re-execute at
2612 
2613   // No need to update map as each call to save_live_registers will produce identical oopmap
2614   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2615 
2616   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2617   __ jmp(cont);
2618 
2619 #if INCLUDE_JVMCI
2620   Label after_fetch_unroll_info_call;
2621   int implicit_exception_uncommon_trap_offset = 0;
2622   int uncommon_trap_offset = 0;
2623 
2624   if (EnableJVMCI) {
2625     implicit_exception_uncommon_trap_offset = __ pc() - start;
2626 
2627     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2628     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2629 
2630     uncommon_trap_offset = __ pc() - start;
2631 
2632     // Save everything in sight.
2633     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2634     // fetch_unroll_info needs to call last_java_frame()
2635     __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2636 
2637     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2638     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2639 
2640     __ movl(r14, Deoptimization::Unpack_reexecute);
2641     __ mov(c_rarg0, r15_thread);
2642     __ movl(c_rarg2, r14); // exec mode
2643     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2644     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2645 
2646     __ reset_last_Java_frame(false);
2647 
2648     __ jmp(after_fetch_unroll_info_call);
2649   } // EnableJVMCI
2650 #endif // INCLUDE_JVMCI
2651 
2652   int exception_offset = __ pc() - start;
2653 
2654   // Prolog for exception case
2655 
2656   // All registers are dead at this entry point, except for rax and
2657   // rdx, which contain the exception oop and exception pc
2658   // respectively.  Set them in TLS and fall through to the
2659   // unpack_with_exception_in_tls entry point.
2660 
2661   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2662   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2663 
2664   int exception_in_tls_offset = __ pc() - start;
2665 
2666   // New implementation: the exception oop is now passed in JavaThread
2667 
2668   // Prolog for exception case
2669   // All registers must be preserved because they might be used by LinearScan
2670   // Exception oop and throwing PC are passed in JavaThread
2671   // tos: stack at point of call to method that threw the exception (i.e. only
2672   // args are on the stack, no return address)
2673 
2674   // make room on stack for the return address
2675   // It will be patched later with the throwing pc. The correct value is not
2676   // available now because loading it from memory would destroy registers.
2677   __ push(0);
2678 
2679   // Save everything in sight.
2680   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2681 
2682   // Now it is safe to overwrite any register
2683 
2684   // Deopt during an exception.  Save exec mode for unpack_frames.
2685   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2686 
2687   // load throwing pc from JavaThread and patch it as the return address
2688   // of the current frame. Then clear the field in JavaThread
2689 
2690   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2691   __ movptr(Address(rbp, wordSize), rdx);
2692   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2693 
2694 #ifdef ASSERT
2695   // verify that there is really an exception oop in JavaThread
2696   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2697   __ verify_oop(rax);
2698 
2699   // verify that there is no pending exception
2700   Label no_pending_exception;
2701   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2702   __ testptr(rax, rax);
2703   __ jcc(Assembler::zero, no_pending_exception);
2704   __ stop("must not have pending exception here");
2705   __ bind(no_pending_exception);
2706 #endif
2707 
2708   __ bind(cont);
2709 
2710   // Call C code.  Need thread and this frame, but NOT official VM entry
2711   // crud.  We cannot block on this call, no GC can happen.
2712   //
2713   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2714 
2715   // fetch_unroll_info needs to call last_java_frame().
2716 
2717   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2718 #ifdef ASSERT
2719   { Label L;
2720     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2721     __ jcc(Assembler::equal, L);
2722     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2723     __ bind(L);
2724   }
2725 #endif // ASSERT
2726   __ mov(c_rarg0, r15_thread);
2727   __ movl(c_rarg1, r14); // exec_mode
2728   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2729 
2730   // Need to have an oopmap that tells fetch_unroll_info where to
2731   // find any register it might need.
2732   oop_maps->add_gc_map(__ pc() - start, map);
2733 
2734   __ reset_last_Java_frame(false);
2735 
2736 #if INCLUDE_JVMCI
2737   if (EnableJVMCI) {
2738     __ bind(after_fetch_unroll_info_call);
2739   }
2740 #endif
2741 
2742   // Load UnrollBlock* into rdi
2743   __ mov(rdi, rax);
2744 
2745   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
2746   Label noException;
2747   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2748   __ jcc(Assembler::notEqual, noException);
2749   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2750   // QQQ this is useless; it was set to null above
2751   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2752   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2753   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2754 
2755   __ verify_oop(rax);
2756 
2757   // Overwrite the result registers with the exception results.
2758   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2759   // I think this is useless
2760   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2761 
2762   __ bind(noException);
2763 
2764   // Only register save data is on the stack.
2765   // Now restore the result registers.  Everything else is either dead
2766   // or captured in the vframeArray.
2767   RegisterSaver::restore_result_registers(masm);
2768 
2769   // All of the register save area has been popped off the stack. Only the
2770   // return address remains.
2771 
2772   // Pop all the frames we must move/replace.
2773   //
2774   // Frame picture (youngest to oldest)
2775   // 1: self-frame (no frame link)
2776   // 2: deopting frame  (no frame link)
2777   // 3: caller of deopting frame (could be compiled/interpreted).
2778   //
2779   // Note: by leaving the return address of the self-frame on the stack
2780   // and using the size of frame 2 to adjust the stack,
2781   // the return address to frame 3 will still be on the stack when we are done.
2782 
2783   // Pop deoptimized frame
2784   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
2785   __ addptr(rsp, rcx);
2786 
2787   // rsp should be pointing at the return address to the caller (3)
2788 
2789   // Pick up the initial fp we should save
2790   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2791   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2792 
2793 #ifdef ASSERT
2794   // Compilers generate code that bangs the stack by as much as the
2795   // interpreter would need, so this stack banging should never
2796   // trigger a fault. Verify that it does not on non-product builds.
2797   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2798   __ bang_stack_size(rbx, rcx);
2799 #endif
2800 
2801   // Load address of array of frame pcs into rcx
2802   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2803 
2804   // Trash the old pc
2805   __ addptr(rsp, wordSize);
2806 
2807   // Load address of array of frame sizes into rsi
2808   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2809 
2810   // Load counter into rdx
2811   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
2812 
2813   // Now adjust the caller's stack to make up for the extra locals,
2814   // but record the original sp first so that we can save it in the skeletal
2815   // interpreter frame; the stack walking of interpreter_sender will then get the
2816   // unextended sp value and not the "real" sp value.
2817 
2818   const Register sender_sp = r8;
2819 
2820   __ mov(sender_sp, rsp);
2821   __ movl(rbx, Address(rdi,
2822                        Deoptimization::UnrollBlock::
2823                        caller_adjustment_offset()));
2824   __ subptr(rsp, rbx);
2825 
2826   // Push interpreter frames in a loop
2827   Label loop;
2828   __ bind(loop);
2829   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2830   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
2831   __ pushptr(Address(rcx, 0));          // Save return address
2832   __ enter();                           // Save old & set new ebp
2833   __ subptr(rsp, rbx);                  // Prolog
2834   // This value is corrected by layout_activation_impl
2835   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
2836   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2837   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2838   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2839   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2840   __ decrementl(rdx);                   // Decrement counter
2841   __ jcc(Assembler::notZero, loop);
2842   __ pushptr(Address(rcx, 0));          // Save final return address
2843 
2844   // Re-push self-frame
2845   __ enter();                           // Save old & set new ebp
2846 
2847   // Allocate a full sized register save area.
2848   // Return address and rbp are in place, so we allocate two fewer words.
2849   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2850 
2851   // Restore frame locals after moving the frame
2852   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2853   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2854 
2855   // Call C code.  Need thread but NOT official VM entry
2856   // crud.  We cannot block on this call, no GC can happen.  Call should
2857   // restore return values to their stack-slots with the new SP.
2858   //
2859   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2860 
2861   // Use rbp because the frames look interpreted now
2862   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2863   // Don't need the precise return PC here, just precise enough to point into this code blob.
2864   address the_pc = __ pc();
2865   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
2866 
2867   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2868   __ mov(c_rarg0, r15_thread);
2869   __ movl(c_rarg1, r14); // second arg: exec_mode
2870   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2871   // Revert SP alignment after call since we're going to do some SP relative addressing below
2872   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2873 
2874   // Set an oopmap for the call site
2875   // Use the same PC we used for the last java frame
2876   oop_maps->add_gc_map(the_pc - start,
2877                        new OopMap( frame_size_in_words, 0 ));
2878 
2879   // Clear fp AND pc
2880   __ reset_last_Java_frame(true);
2881 
2882   // Collect return values
2883   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2884   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2885   // I think this is useless (throwing pc?)
2886   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2887 
2888   // Pop self-frame.
2889   __ leave();                           // Epilog
2890 
2891   // Jump to interpreter
2892   __ ret(0);
2893 
2894   // Make sure all code is generated
2895   masm->flush();
2896 
2897   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2898   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2899 #if INCLUDE_JVMCI
2900   if (EnableJVMCI) {
2901     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2902     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2903   }
2904 #endif
2905 
2906   AOTCodeCache::store_code_blob(*_deopt_blob, AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
2907 }
2908 
2909 //------------------------------generate_handler_blob------
2910 //
2911 // Generate a special Compile2Runtime blob that saves all registers
2912 // and sets up an oopmap.
2913 //
2914 SafepointBlob* SharedRuntime::generate_handler_blob(StubId id, address call_ptr) {
2915   assert(StubRoutines::forward_exception_entry() != nullptr,
2916          "must be generated before");
2917   assert(is_polling_page_id(id), "expected a polling page stub id");
2918 
2919   // Allocate space for the code.  Setup code generation tools.
2920   const char* name = SharedRuntime::stub_name(id);
2921   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
2922   if (blob != nullptr) {
2923     return blob->as_safepoint_blob();
2924   }
2925 
2926   ResourceMark rm;
2927   OopMapSet *oop_maps = new OopMapSet();
2928   OopMap* map;
2929   CodeBuffer buffer(name, 2548, 1024);
2930   MacroAssembler* masm = new MacroAssembler(&buffer);
2931 
2932   address start   = __ pc();
2933   address call_pc = nullptr;
2934   int frame_size_in_words;
2935   bool cause_return = (id == StubId::shared_polling_page_return_handler_id);
2936   bool save_wide_vectors = (id == StubId::shared_polling_page_vectors_safepoint_handler_id);
2937 
2938   // Make room for return address (or push it again)
2939   if (!cause_return) {
2940     __ push(rbx);
2941   }
2942 
2943   // Save registers, fpu state, and flags
2944   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
2945 
2946   // The following is basically a call_VM.  However, we need the precise
2947   // address of the call in order to generate an oopmap. Hence, we do all the
2948   // work ourselves.
2949 
2950   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
2951 
2952   // The return address must always be correct so that the frame constructor never
2953   // sees an invalid pc.
2954 
2955   if (!cause_return) {
2956     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
2957     // Additionally, rbx is a callee-saved register, so we can look at it later to determine
2958     // if someone changed the return address for us!
2959     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
2960     __ movptr(Address(rbp, wordSize), rbx);
2961   }
2962 
2963   // Do the call
2964   __ mov(c_rarg0, r15_thread);
2965   __ call(RuntimeAddress(call_ptr));
2966 
2967   // Set an oopmap for the call site.  This oopmap will map all
2968   // oop-registers and debug-info registers as callee-saved.  This
2969   // will allow deoptimization at this safepoint to find all possible
2970   // debug-info recordings, as well as let GC find all oops.
2971 
2972   oop_maps->add_gc_map( __ pc() - start, map);
2973 
2974   Label noException;
2975 
2976   __ reset_last_Java_frame(false);
2977 
2978   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
2979   __ jcc(Assembler::equal, noException);
2980 
2981   // Exception pending
2982 
2983   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
2984 
2985   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2986 
2987   // No exception case
2988   __ bind(noException);
2989 
2990   Label no_adjust;
2991 #ifdef ASSERT
2992   Label bail;
2993 #endif
2994   if (!cause_return) {
2995     Label no_prefix, not_special, check_rex_prefix;
2996 
2997     // If our stashed return pc was modified by the runtime we avoid touching it
2998     __ cmpptr(rbx, Address(rbp, wordSize));
2999     __ jcc(Assembler::notEqual, no_adjust);
3000 
3001     // Skip over the poll instruction.
3002     // See NativeInstruction::is_safepoint_poll()
3003     // Possible encodings:
3004     //      85 00       test   %eax,(%rax)
3005     //      85 01       test   %eax,(%rcx)
3006     //      85 02       test   %eax,(%rdx)
3007     //      85 03       test   %eax,(%rbx)
3008     //      85 06       test   %eax,(%rsi)
3009     //      85 07       test   %eax,(%rdi)
3010     //
3011     //   41 85 00       test   %eax,(%r8)
3012     //   41 85 01       test   %eax,(%r9)
3013     //   41 85 02       test   %eax,(%r10)
3014     //   41 85 03       test   %eax,(%r11)
3015     //   41 85 06       test   %eax,(%r14)
3016     //   41 85 07       test   %eax,(%r15)
3017     //
3018     //      85 04 24    test   %eax,(%rsp)
3019     //   41 85 04 24    test   %eax,(%r12)
3020     //      85 45 00    test   %eax,0x0(%rbp)
3021     //   41 85 45 00    test   %eax,0x0(%r13)
3022     //
3023     // Notes:
3024     //  Format of the legacy MAP0 test instruction:
3025     //  [REX/REX2] [OPCODE] [ModRM] [SIB] [DISP] [IMM32]
3026     //  o  For a safepoint polling instruction such as "test %eax,(%rax)", the encodings of the
3027     //     first register operand and of the base register of the memory operand are both in
3028     //     [0-8), so no additional REX prefix (whose REX.B bit holds the MSB of the register
3029     //     encoding) is required and a two-byte encoding is sufficient.
3030     //  o  For a safepoint polling instruction such as "test %eax,(%r8)", the encoding of the
3031     //     base register of the memory operand is 1000, so an additional REX prefix is needed,
3032     //     thereby adding one byte to the instruction encoding.
3033     //  o  If the base register is one of the 32 extended GPRs available only on targets that
3034     //     support the Intel APX extension, a two-byte REX2 prefix must be emitted to hold the
3035     //     most significant two bits of the 5-bit register encoding.
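         //
         // Worked example (illustrative): for "test %eax,0x0(%r13)" the bytes are
         // 41 85 45 00.  The code below first skips the REX prefix (41), then sees
         // that the ModRM base field is r13/rbp (low bits 0x05) and skips one more
         // byte, and finally adds 2 to step over the opcode and ModRM bytes (85 45),
         // for a total adjustment of 4 bytes, the length of the poll instruction.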
3036 
3037     if (VM_Version::supports_apx_f()) {
3038       __ cmpb(Address(rbx, 0), Assembler::REX2);
3039       __ jccb(Assembler::notEqual, check_rex_prefix);
3040       __ addptr(rbx, 2);
3041       __ bind(check_rex_prefix);
3042     }
3043     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3044     __ jccb(Assembler::notEqual, no_prefix);
3045     __ addptr(rbx, 1);
3046     __ bind(no_prefix);
3047 #ifdef ASSERT
3048     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3049 #endif
3050     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3051     // r12/rsp 0x04
3052     // r13/rbp 0x05
3053     __ movzbq(rcx, Address(rbx, 1));
3054     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3055     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3056     __ cmpptr(rcx, 1);
3057     __ jccb(Assembler::above, not_special);
3058     __ addptr(rbx, 1);
3059     __ bind(not_special);
3060 #ifdef ASSERT
3061     // Verify the correct encoding of the poll we're about to skip.
3062     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3063     __ jcc(Assembler::notEqual, bail);
3064     // Mask out the modrm bits
3065     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3066     // rax encodes to 0, so if the bits are nonzero it's incorrect
3067     __ jcc(Assembler::notZero, bail);
3068 #endif
3069     // Adjust return pc forward to step over the safepoint poll instruction
3070     __ addptr(rbx, 2);
3071     __ movptr(Address(rbp, wordSize), rbx);
3072   }
3073 
3074   __ bind(no_adjust);
3075   // Normal exit, restore registers and exit.
3076   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3077   __ ret(0);
3078 
3079 #ifdef ASSERT
3080   __ bind(bail);
3081   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3082 #endif
3083 
3084   // Make sure all code is generated
3085   masm->flush();
3086 
3087   // Fill-out other meta info
3088   SafepointBlob* sp_blob = SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3089 
3090   AOTCodeCache::store_code_blob(*sp_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3091   return sp_blob;
3092 }
3093 
3094 //
3095 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3096 //
3097 // Generate a stub that calls into the VM to find out the proper destination
3098 // of a Java call. All the argument registers are live at this point,
3099 // but since this is generic code we don't know what they are, and the caller
3100 // must do any GC of the args.
3101 //
3102 RuntimeStub* SharedRuntime::generate_resolve_blob(StubId id, address destination) {
3103   assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3104   assert(is_resolve_id(id), "expected a resolve stub id");
3105 
3106   const char* name = SharedRuntime::stub_name(id);
3107   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3108   if (blob != nullptr) {
3109     return blob->as_runtime_stub();
3110   }
3111 
3112   // allocate space for the code
3113   ResourceMark rm;
3114   CodeBuffer buffer(name, 1552, 512);
3115   MacroAssembler* masm = new MacroAssembler(&buffer);
3116 
3117   int frame_size_in_words;
3118 
3119   OopMapSet *oop_maps = new OopMapSet();
3120   OopMap* map = nullptr;
3121 
3122   int start = __ offset();
3123 
3124   // No need to save vector registers since they are caller-saved anyway.
3125   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3126 
3127   int frame_complete = __ offset();
3128 
3129   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3130 
3131   __ mov(c_rarg0, r15_thread);
3132 
3133   __ call(RuntimeAddress(destination));
3134 
3135 
3136   // Set an oopmap for the call site.
3137   // We need this not only for callee-saved registers, but also for volatile
3138   // registers that the compiler might be keeping live across a safepoint.
3139 
3140   oop_maps->add_gc_map( __ offset() - start, map);
3141 
3142   // rax contains the address we are going to jump to assuming no exception got installed
3143 
3144   // clear last_Java_sp
3145   __ reset_last_Java_frame(false);
3146   // check for pending exceptions
3147   Label pending;
3148   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3149   __ jcc(Assembler::notEqual, pending);
3150 
3151   // get the returned Method*
3152   __ get_vm_result_metadata(rbx);
3153   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3154 
3155   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3156 
3157   RegisterSaver::restore_live_registers(masm);
3158 
3159   // We are back to the original state on entry and ready to go.
3160 
3161   __ jmp(rax);
3162 
3163   // Pending exception after the safepoint
3164 
3165   __ bind(pending);
3166 
3167   RegisterSaver::restore_live_registers(masm);
3168 
3169   // exception pending => remove activation and forward to exception handler
3170 
3171   __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
3172 
3173   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3174   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3175 
3176   // -------------
3177   // make sure all code is generated
3178   masm->flush();
3179 
3180   // return the blob
3181   // (the frame size passed to new_runtime_stub is in words)
3182   RuntimeStub* rs_blob = RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3183 
3184   AOTCodeCache::store_code_blob(*rs_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3185   return rs_blob;
3186 }
3187 
3188 // Continuation point for throwing of implicit exceptions that are
3189 // not handled in the current activation. Fabricates an exception
3190 // oop and initiates normal exception dispatching in this
3191 // frame. Since we need to preserve callee-saved values (currently
3192 // only for C2, but done for C1 as well) we need a callee-saved oop
3193 // map and therefore have to make these stubs into RuntimeStubs
3194 // rather than BufferBlobs.  If the compiler needs all registers to
3195 // be preserved between the fault point and the exception handler
3196 // then it must assume responsibility for that in
3197 // AbstractCompiler::continuation_for_implicit_null_exception or
3198 // continuation_for_implicit_division_by_zero_exception. All other
3199 // implicit exceptions (e.g., NullPointerException or
3200 // AbstractMethodError on entry) are either at call sites or
3201 // otherwise assume that stack unwinding will be initiated, so
3202 // caller-saved registers were assumed volatile in the compiler.
3203 RuntimeStub* SharedRuntime::generate_throw_exception(StubId id, address runtime_entry) {
3204   assert(is_throw_id(id), "expected a throw stub id");
3205 
3206   const char* name = SharedRuntime::stub_name(id);
3207 
3208   // Information about frame layout at time of blocking runtime call.
3209   // Note that we only have to preserve callee-saved registers since
3210   // the compilers are responsible for supplying a continuation point
3211   // if they expect all registers to be preserved.
3212   enum layout {
3213     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
3214     rbp_off2,
3215     return_off,
3216     return_off2,
3217     framesize // inclusive of return address
3218   };
3219 
3220   int insts_size = 512;
3221   int locs_size  = 64;
3222 
3223   const char* timer_msg = "SharedRuntime generate_throw_exception";
3224   TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime));
3225 
3226   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3227   if (blob != nullptr) {
3228     return blob->as_runtime_stub();
3229   }
3230 
3231   ResourceMark rm;
3232   CodeBuffer code(name, insts_size, locs_size);
3233   OopMapSet* oop_maps  = new OopMapSet();
3234   MacroAssembler* masm = new MacroAssembler(&code);
3235 
3236   address start = __ pc();
3237 
3238   // This is an inlined and slightly modified version of call_VM
3239   // which has the ability to fetch the return PC out of
3240   // thread-local storage and also sets up last_Java_sp slightly
3241   // differently than the real call_VM
3242 
3243   __ enter(); // required for proper stackwalking of RuntimeStub frame
3244 
3245   assert(is_even(framesize/2), "sp not 16-byte aligned");
3246 
3247   // return address and rbp are already in place
3248   __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
3249 
3250   int frame_complete = __ pc() - start;
3251 
3252   // Set up last_Java_sp and last_Java_fp
3253   address the_pc = __ pc();
3254   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3255   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3256 
3257   // Call runtime
3258   __ movptr(c_rarg0, r15_thread);
3259   BLOCK_COMMENT("call runtime_entry");
3260   __ call(RuntimeAddress(runtime_entry));
3261 
3262   // Generate oop map
3263   OopMap* map = new OopMap(framesize, 0);
3264 
3265   oop_maps->add_gc_map(the_pc - start, map);
3266 
3267   __ reset_last_Java_frame(true);
3268 
3269   __ leave(); // required for proper stackwalking of RuntimeStub frame
3270 
3271   // check for pending exceptions
3272 #ifdef ASSERT
3273   Label L;
3274   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3275   __ jcc(Assembler::notEqual, L);
3276   __ should_not_reach_here();
3277   __ bind(L);
3278 #endif // ASSERT
3279   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3280 
3281 
3282   // codeBlob framesize is in words (not VMRegImpl::slot_size)
3283   RuntimeStub* stub =
3284     RuntimeStub::new_runtime_stub(name,
3285                                   &code,
3286                                   frame_complete,
3287                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3288                                   oop_maps, false);
3289   AOTCodeCache::store_code_blob(*stub, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3290 
3291   return stub;
3292 }
3293 
3294 //------------------------------Montgomery multiplication------------------------
3295 //
3296 
3297 #ifndef _WINDOWS
3298 
3299 // Subtract 0:b from carry:a.  Return carry.
3300 static julong
3301 sub(julong a[], julong b[], julong carry, long len) {
3302   long long i = 0, cnt = len;
3303   julong tmp;
3304   asm volatile("clc; "
3305                "0: ; "
3306                "mov (%[b], %[i], 8), %[tmp]; "
3307                "sbb %[tmp], (%[a], %[i], 8); "
3308                "inc %[i]; dec %[cnt]; "
3309                "jne 0b; "
3310                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3311                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3312                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3313                : "memory");
3314   return tmp;
3315 }
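     // For reference, a rough C equivalent of the assembly above (an illustrative
     // sketch only, not used by the build):
     //
     //   static julong sub_ref(julong a[], julong b[], julong carry, long len) {
     //     int borrow = 0;
     //     for (long i = 0; i < len; i++) {
     //       julong ai = a[i], bi = b[i];
     //       a[i]   = ai - bi - borrow;
     //       borrow = (ai < bi) || (borrow && ai == bi);
     //     }
     //     return carry - borrow;   // propagate the final borrow into carry
     //   }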
3316 
3317 // Multiply (unsigned) Long A by Long B, accumulating the double-
3318 // length result into the accumulator formed of T0, T1, and T2.
3319 #define MACC(A, B, T0, T1, T2)                                  \
3320 do {                                                            \
3321   unsigned long hi, lo;                                         \
3322   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3323            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3324            : "r"(A), "a"(B) : "cc");                            \
3325  } while(0)
3326 
3327 // As above, but add twice the double-length result into the
3328 // accumulator.
3329 #define MACC2(A, B, T0, T1, T2)                                 \
3330 do {                                                            \
3331   unsigned long hi, lo;                                         \
3332   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3333            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3334            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3335            : "r"(A), "a"(B) : "cc");                            \
3336  } while(0)
3337 
3338 #else //_WINDOWS
3339 
3340 static julong
3341 sub(julong a[], julong b[], julong carry, long len) {
3342   long i;
3343   julong tmp;
3344   unsigned char c = 1;
3345   for (i = 0; i < len; i++) {
3346     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3347     a[i] = tmp;
3348   }
3349   c = _addcarry_u64(c, carry, ~0, &tmp);
3350   return tmp;
3351 }
3352 
3353 // Multiply (unsigned) Long A by Long B, accumulating the double-
3354 // length result into the accumulator formed of T0, T1, and T2.
3355 #define MACC(A, B, T0, T1, T2)                          \
3356 do {                                                    \
3357   julong hi, lo;                            \
3358   lo = _umul128(A, B, &hi);                             \
3359   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3360   c = _addcarry_u64(c, hi, T1, &T1);                    \
3361   _addcarry_u64(c, T2, 0, &T2);                         \
3362  } while(0)
3363 
3364 // As above, but add twice the double-length result into the
3365 // accumulator.
3366 #define MACC2(A, B, T0, T1, T2)                         \
3367 do {                                                    \
3368   julong hi, lo;                            \
3369   lo = _umul128(A, B, &hi);                             \
3370   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3371   c = _addcarry_u64(c, hi, T1, &T1);                    \
3372   _addcarry_u64(c, T2, 0, &T2);                         \
3373   c = _addcarry_u64(0, lo, T0, &T0);                    \
3374   c = _addcarry_u64(c, hi, T1, &T1);                    \
3375   _addcarry_u64(c, T2, 0, &T2);                         \
3376  } while(0)
3377 
3378 #endif //_WINDOWS
3379 
3380 // Fast Montgomery multiplication.  The derivation of the algorithm is
3381 // in  A Cryptographic Library for the Motorola DSP56000,
3382 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
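     //
     // The key step is the choice of m[i] below: inv is the negated inverse of n[0]
     // mod 2^64 (hence the assert inv * n[0] == ULLONG_MAX, i.e. inv * n[0] == -1
     // mod 2^64), so m[i] = t0 * inv makes t0 + m[i] * n[0] == 0 mod 2^64.  The low
     // accumulator word therefore cancels and the result can be shifted down one
     // word per iteration.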
3383 
3384 static void NOINLINE
3385 montgomery_multiply(julong a[], julong b[], julong n[],
3386                     julong m[], julong inv, int len) {
3387   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3388   int i;
3389 
3390   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3391 
3392   for (i = 0; i < len; i++) {
3393     int j;
3394     for (j = 0; j < i; j++) {
3395       MACC(a[j], b[i-j], t0, t1, t2);
3396       MACC(m[j], n[i-j], t0, t1, t2);
3397     }
3398     MACC(a[i], b[0], t0, t1, t2);
3399     m[i] = t0 * inv;
3400     MACC(m[i], n[0], t0, t1, t2);
3401 
3402     assert(t0 == 0, "broken Montgomery multiply");
3403 
3404     t0 = t1; t1 = t2; t2 = 0;
3405   }
3406 
3407   for (i = len; i < 2*len; i++) {
3408     int j;
3409     for (j = i-len+1; j < len; j++) {
3410       MACC(a[j], b[i-j], t0, t1, t2);
3411       MACC(m[j], n[i-j], t0, t1, t2);
3412     }
3413     m[i-len] = t0;
3414     t0 = t1; t1 = t2; t2 = 0;
3415   }
3416 
3417   while (t0)
3418     t0 = sub(m, n, t0, len);
3419 }
3420 
3421 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3422 // multiplies so it should be up to 25% faster than Montgomery
3423 // multiplication.  However, its loop control is more complex and it
3424 // may actually run slower on some machines.
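     //
     // The saving comes from symmetry: each off-diagonal product a[j]*a[i-j]
     // appears twice, so the inner loops below use MACC2 to add it once at double
     // weight, and only the diagonal term a[j]*a[j] (present when i is even) is
     // accumulated with a plain MACC.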
3425 
3426 static void NOINLINE
3427 montgomery_square(julong a[], julong n[],
3428                   julong m[], julong inv, int len) {
3429   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3430   int i;
3431 
3432   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3433 
3434   for (i = 0; i < len; i++) {
3435     int j;
3436     int end = (i+1)/2;
3437     for (j = 0; j < end; j++) {
3438       MACC2(a[j], a[i-j], t0, t1, t2);
3439       MACC(m[j], n[i-j], t0, t1, t2);
3440     }
3441     if ((i & 1) == 0) {
3442       MACC(a[j], a[j], t0, t1, t2);
3443     }
3444     for (; j < i; j++) {
3445       MACC(m[j], n[i-j], t0, t1, t2);
3446     }
3447     m[i] = t0 * inv;
3448     MACC(m[i], n[0], t0, t1, t2);
3449 
3450     assert(t0 == 0, "broken Montgomery square");
3451 
3452     t0 = t1; t1 = t2; t2 = 0;
3453   }
3454 
3455   for (i = len; i < 2*len; i++) {
3456     int start = i-len+1;
3457     int end = start + (len - start)/2;
3458     int j;
3459     for (j = start; j < end; j++) {
3460       MACC2(a[j], a[i-j], t0, t1, t2);
3461       MACC(m[j], n[i-j], t0, t1, t2);
3462     }
3463     if ((i & 1) == 0) {
3464       MACC(a[j], a[j], t0, t1, t2);
3465     }
3466     for (; j < len; j++) {
3467       MACC(m[j], n[i-j], t0, t1, t2);
3468     }
3469     m[i-len] = t0;
3470     t0 = t1; t1 = t2; t2 = 0;
3471   }
3472 
3473   while (t0)
3474     t0 = sub(m, n, t0, len);
3475 }
3476 
3477 // Swap words in a longword.
3478 static julong swap(julong x) {
3479   return (x << 32) | (x >> 32);
3480 }
3481 
3482 // Copy len longwords from s to d, word-swapping as we go.  The
3483 // destination array is reversed.
3484 static void reverse_words(julong *s, julong *d, int len) {
3485   d += len;
3486   while(len-- > 0) {
3487     d--;
3488     *d = swap(*s);
3489     s++;
3490   }
3491 }
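     // For example, with len == 2 the loop above stores d[1] = swap(s[0]) and
     // d[0] = swap(s[1]); in general d[len-1-i] = swap(s[i]).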
3492 
3493 // The threshold at which squaring is advantageous was determined
3494 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3495 #define MONTGOMERY_SQUARING_THRESHOLD 64
3496 
3497 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3498                                         jint len, jlong inv,
3499                                         jint *m_ints) {
3500   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3501   int longwords = len/2;
3502 
3503   // Make very sure we don't use so much space that the stack might
3504   // overflow.  512 jints corresponds to a 16384-bit integer and
3505   // will use a total of 8K bytes of stack space here.
3506   int divisor = sizeof(julong) * 4;
3507   guarantee(longwords <= 8192 / divisor, "must be");
3508   int total_allocation = longwords * sizeof (julong) * 4;
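       // At the limit allowed above: longwords = 8192 / 32 = 256 (i.e. len = 512
       // jints), so total_allocation = 256 * 4 * sizeof(julong) = 8192 bytes.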
3509   julong *scratch = (julong *)alloca(total_allocation);
3510 
3511   // Local scratch arrays
3512   julong
3513     *a = scratch + 0 * longwords,
3514     *b = scratch + 1 * longwords,
3515     *n = scratch + 2 * longwords,
3516     *m = scratch + 3 * longwords;
3517 
3518   reverse_words((julong *)a_ints, a, longwords);
3519   reverse_words((julong *)b_ints, b, longwords);
3520   reverse_words((julong *)n_ints, n, longwords);
3521 
3522   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3523 
3524   reverse_words(m, (julong *)m_ints, longwords);
3525 }
3526 
3527 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3528                                       jint len, jlong inv,
3529                                       jint *m_ints) {
3530   assert(len % 2 == 0, "array length in montgomery_square must be even");
3531   int longwords = len/2;
3532 
3533   // Make very sure we don't use so much space that the stack might
3534   // overflow.  512 jints corresponds to a 16384-bit integer and
3535   // will use a total of 6K bytes of stack space here.
3536   int divisor = sizeof(julong) * 3;
3537   guarantee(longwords <= (8192 / divisor), "must be");
3538   int total_allocation = longwords * sizeof (julong) * 3;
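       // For len = 512 jints (256 longwords) this is 256 * 3 * sizeof(julong) =
       // 6144 bytes; the guarantee above allows at most 8192 / 24 = 341 longwords.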
3539   julong *scratch = (julong *)alloca(total_allocation);
3540 
3541   // Local scratch arrays
3542   julong
3543     *a = scratch + 0 * longwords,
3544     *n = scratch + 1 * longwords,
3545     *m = scratch + 2 * longwords;
3546 
3547   reverse_words((julong *)a_ints, a, longwords);
3548   reverse_words((julong *)n_ints, n, longwords);
3549 
3550   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3551     ::montgomery_square(a, n, m, (julong)inv, longwords);
3552   } else {
3553     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3554   }
3555 
3556   reverse_words(m, (julong *)m_ints, longwords);
3557 }
3558 
3559 #if INCLUDE_JFR
3560 
3561 // For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
3562 // It returns a jobject handle to the event writer.
3563 // The handle is dereferenced and the return value is the event writer oop.
3564 RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() {
3565   enum layout {
3566     rbp_off,
3567     rbpH_off,
3568     return_off,
3569     return_off2,
3570     framesize // inclusive of return address
3571   };
3572 
3573   const char* name = SharedRuntime::stub_name(StubId::shared_jfr_write_checkpoint_id);
3574   CodeBuffer code(name, 1024, 64);
3575   MacroAssembler* masm = new MacroAssembler(&code);
3576   address start = __ pc();
3577 
3578   __ enter();
3579   address the_pc = __ pc();
3580 
3581   int frame_complete = the_pc - start;
3582 
3583   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3584   __ movptr(c_rarg0, r15_thread);
3585   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
3586   __ reset_last_Java_frame(true);
3587 
3588   // rax is jobject handle result, unpack and process it through a barrier.
3589   __ resolve_global_jobject(rax, c_rarg0);
3590 
3591   __ leave();
3592   __ ret(0);
3593 
3594   OopMapSet* oop_maps = new OopMapSet();
3595   OopMap* map = new OopMap(framesize, 1);
3596   oop_maps->add_gc_map(frame_complete, map);
3597 
3598   RuntimeStub* stub =
3599     RuntimeStub::new_runtime_stub(name,
3600                                   &code,
3601                                   frame_complete,
3602                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3603                                   oop_maps,
3604                                   false);
3605   return stub;
3606 }
3607 
3608 // For c2: call to return a leased buffer.
3609 RuntimeStub* SharedRuntime::generate_jfr_return_lease() {
3610   enum layout {
3611     rbp_off,
3612     rbpH_off,
3613     return_off,
3614     return_off2,
3615     framesize // inclusive of return address
3616   };
3617 
3618   const char* name = SharedRuntime::stub_name(StubId::shared_jfr_return_lease_id);
3619   CodeBuffer code(name, 1024, 64);
3620   MacroAssembler* masm = new MacroAssembler(&code);
3621   address start = __ pc();
3622 
3623   __ enter();
3624   address the_pc = __ pc();
3625 
3626   int frame_complete = the_pc - start;
3627 
3628   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2);
3629   __ movptr(c_rarg0, r15_thread);
3630   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
3631   __ reset_last_Java_frame(true);
3632 
3633   __ leave();
3634   __ ret(0);
3635 
3636   OopMapSet* oop_maps = new OopMapSet();
3637   OopMap* map = new OopMap(framesize, 1);
3638   oop_maps->add_gc_map(frame_complete, map);
3639 
3640   RuntimeStub* stub =
3641     RuntimeStub::new_runtime_stub(name,
3642                                   &code,
3643                                   frame_complete,
3644                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3645                                   oop_maps,
3646                                   false);
3647   return stub;
3648 }
3649 
3650 #endif // INCLUDE_JFR
3651