1 /*
   2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #ifndef _WINDOWS
  26 #include "alloca.h"
  27 #endif
  28 #include "asm/macroAssembler.hpp"
  29 #include "asm/macroAssembler.inline.hpp"
  30 #include "code/aotCodeCache.hpp"
  31 #include "code/compiledIC.hpp"
  32 #include "code/debugInfoRec.hpp"
  33 #include "code/nativeInst.hpp"
  34 #include "code/vtableStubs.hpp"
  35 #include "compiler/oopMap.hpp"
  36 #include "gc/shared/collectedHeap.hpp"
  37 #include "gc/shared/gcLocker.hpp"
  38 #include "gc/shared/barrierSet.hpp"
  39 #include "gc/shared/barrierSetAssembler.hpp"
  40 #include "interpreter/interpreter.hpp"
  41 #include "logging/log.hpp"
  42 #include "memory/resourceArea.hpp"
  43 #include "memory/universe.hpp"
  44 #include "oops/klass.inline.hpp"
  45 #include "oops/method.inline.hpp"
  46 #include "prims/methodHandles.hpp"
  47 #include "runtime/continuation.hpp"
  48 #include "runtime/continuationEntry.inline.hpp"
  49 #include "runtime/globals.hpp"
  50 #include "runtime/jniHandles.hpp"
  51 #include "runtime/safepointMechanism.hpp"
  52 #include "runtime/sharedRuntime.hpp"
  53 #include "runtime/signature.hpp"
  54 #include "runtime/stubRoutines.hpp"
  55 #include "runtime/timerTrace.hpp"
  56 #include "runtime/vframeArray.hpp"
  57 #include "runtime/vm_version.hpp"
  58 #include "utilities/align.hpp"
  59 #include "utilities/checkedCast.hpp"
  60 #include "utilities/formatBuffer.hpp"
  61 #include "vmreg_x86.inline.hpp"
  62 #ifdef COMPILER1
  63 #include "c1/c1_Runtime1.hpp"
  64 #endif
  65 #ifdef COMPILER2
  66 #include "opto/runtime.hpp"
  67 #endif
  68 #if INCLUDE_JVMCI
  69 #include "jvmci/jvmciJavaClasses.hpp"
  70 #endif
  71 
  72 #define __ masm->
  73 
  74 #ifdef PRODUCT
  75 #define BLOCK_COMMENT(str) /* nothing */
  76 #else
  77 #define BLOCK_COMMENT(str) __ block_comment(str)
  78 #endif // PRODUCT
  79 
  80 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  81 
  82 class RegisterSaver {
  83   // Capture info about frame layout.  Layout offsets are in jint
  84   // units because compiler frame slots are jints.
  85 #define XSAVE_AREA_BEGIN 160
  86 #define XSAVE_AREA_YMM_BEGIN 576
  87 #define XSAVE_AREA_EGPRS 960
  88 #define XSAVE_AREA_OPMASK_BEGIN 1088
  89 #define XSAVE_AREA_ZMM_BEGIN 1152
  90 #define XSAVE_AREA_UPPERBANK 1664
  91 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
  92 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
  93 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
  94 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
  95 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
  96   enum layout {
  97     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt,    // fxsave save area
  98     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,  // offset in fxsave save area
  99     DEF_XMM_OFFS(0),
 100     DEF_XMM_OFFS(1),
 101     // 2..15 are implied in range usage
 102     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 103     DEF_YMM_OFFS(0),
 104     DEF_YMM_OFFS(1),
 105     r16_off = xmm_off + (XSAVE_AREA_EGPRS - XSAVE_AREA_BEGIN)/BytesPerInt,
 106     r16H_off,
 107     r17_off, r17H_off,
 108     r18_off, r18H_off,
 109     r19_off, r19H_off,
 110     r20_off, r20H_off,
 111     r21_off, r21H_off,
 112     r22_off, r22H_off,
 113     r23_off, r23H_off,
 114     r24_off, r24H_off,
 115     r25_off, r25H_off,
 116     r26_off, r26H_off,
 117     r27_off, r27H_off,
 118     r28_off, r28H_off,
 119     r29_off, r29H_off,
 120     r30_off, r30H_off,
 121     r31_off, r31H_off,
 122     opmask_off   = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 123     DEF_OPMASK_OFFS(0),
 124     DEF_OPMASK_OFFS(1),
 125     // 2..7 are implied in range usage
 126     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 127     DEF_ZMM_OFFS(0),
 128     DEF_ZMM_OFFS(1),
 129     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 130     DEF_ZMM_UPPER_OFFS(16),
 131     DEF_ZMM_UPPER_OFFS(17),
 132     // 18..31 are implied in range usage
 133     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 134     fpu_stateH_end,
 135     r15_off, r15H_off,
 136     r14_off, r14H_off,
 137     r13_off, r13H_off,
 138     r12_off, r12H_off,
 139     r11_off, r11H_off,
 140     r10_off, r10H_off,
 141     r9_off,  r9H_off,
 142     r8_off,  r8H_off,
 143     rdi_off, rdiH_off,
 144     rsi_off, rsiH_off,
 145     ignore_off, ignoreH_off,  // extra copy of rbp
 146     rsp_off, rspH_off,
 147     rbx_off, rbxH_off,
 148     rdx_off, rdxH_off,
 149     rcx_off, rcxH_off,
 150     rax_off, raxH_off,
 151     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 152     align_off, alignH_off,
 153     flags_off, flagsH_off,
 154     // The frame sender code expects that rbp will be in the "natural" place and
 155     // will override any oopMap setting for it. We must therefore force the layout
 156     // so that it agrees with the frame sender code.
 157     rbp_off, rbpH_off,        // copy of rbp we will restore
 158     return_off, returnH_off,  // slot for return address
 159     reg_save_size             // size in compiler stack slots
 160   };
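       // Illustrative arithmetic only (not used by the code): on a target where
       // frame::arg_reg_save_area_bytes is 0, fpu_state_off == 0 and xmm_off ==
       // XSAVE_AREA_BEGIN/BytesPerInt == 160/4 == 40, so xmm0_off == 40 and
       // xmm1_off == 44, since each 16-byte XMM save slot spans four jint slots.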
 161 
 162  public:
 163   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 164   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 165 
 166   // Offsets into the register save area
 167   // Used by deoptimization when it is managing result register
 168   // values on its own
 169 
 170   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 171   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 172   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 173   static int r15_offset_in_bytes(void)    { return BytesPerInt * r15_off; }
 174   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 175   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 176 
 177   // During deoptimization only the result registers need to be restored,
 178   // all the other values have already been extracted.
 179   static void restore_result_registers(MacroAssembler* masm);
 180 };
 181 
 182 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 183   int off = 0;
 184   int num_xmm_regs = XMMRegister::available_xmm_registers();
 185 #if COMPILER2_OR_JVMCI
 186   if (save_wide_vectors && UseAVX == 0) {
 187     save_wide_vectors = false; // vectors larger than 16 bytes are supported only with AVX
 188   }
 189   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 190 #else
 191   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 192 #endif
 193 
 194   // Always make the frame size 16-byte aligned; both vector and non-vector stack frames always allocate the full save area
 195   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 196   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 197   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 198   // CodeBlob frame size is in words.
 199   int frame_size_in_words = frame_size_in_bytes / wordSize;
 200   *total_frame_words = frame_size_in_words;
 201 
 202   // Save registers, fpu state, and flags.
 203   // We assume caller has already pushed the return address onto the
 204   // stack, so rsp is 8-byte aligned here.
 205   // We push rbp twice in this sequence because we want the real rbp
 206   // to be under the return address, just like a normal enter.
 207 
 208   __ enter();          // rsp becomes 16-byte aligned here
 209   __ pushf();
 210   // Make sure rsp stays 16-byte aligned
 211   __ subq(rsp, 8);
 212   // Push CPU state in multiples of 16 bytes
 213   __ save_legacy_gprs();
 214   __ push_FPU_state();
 215 
 216 
 217   // push cpu state handles this on EVEX enabled targets
 218   if (save_wide_vectors) {
 219     // Save upper half of YMM registers(0..15)
 220     int base_addr = XSAVE_AREA_YMM_BEGIN;
 221     for (int n = 0; n < 16; n++) {
 222       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 223     }
 224     if (VM_Version::supports_evex()) {
 225       // Save upper half of ZMM registers(0..15)
 226       base_addr = XSAVE_AREA_ZMM_BEGIN;
 227       for (int n = 0; n < 16; n++) {
 228         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 229       }
 230       // Save full ZMM registers(16..num_xmm_regs)
 231       base_addr = XSAVE_AREA_UPPERBANK;
 232       off = 0;
 233       int vector_len = Assembler::AVX_512bit;
 234       for (int n = 16; n < num_xmm_regs; n++) {
 235         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 236       }
 237 #if COMPILER2_OR_JVMCI
 238       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 239       off = 0;
 240       for(int n = 0; n < KRegister::number_of_registers; n++) {
 241         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 242       }
 243 #endif
 244     }
 245   } else {
 246     if (VM_Version::supports_evex()) {
 247       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 248       int base_addr = XSAVE_AREA_UPPERBANK;
 249       off = 0;
 250       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 251       for (int n = 16; n < num_xmm_regs; n++) {
 252         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 253       }
 254 #if COMPILER2_OR_JVMCI
 255       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 256       off = 0;
 257       for(int n = 0; n < KRegister::number_of_registers; n++) {
 258         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 259       }
 260 #endif
 261     }
 262   }
 263 
 264 #if COMPILER2_OR_JVMCI
 265   if (UseAPX) {
 266       int base_addr = XSAVE_AREA_EGPRS;
 267       off = 0;
 268       for (int n = 16; n < Register::number_of_registers; n++) {
 269         __ movq(Address(rsp, base_addr+(off++*8)), as_Register(n));
 270       }
 271   }
 272 #endif
 273 
 274   __ vzeroupper();
 275   if (frame::arg_reg_save_area_bytes != 0) {
 276     // Allocate argument register save area
 277     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 278   }
 279 
 280   // Set an oopmap for the call site.  This oopmap will map all
 281   // oop-registers and debug-info registers as callee-saved.  This
 282   // will allow deoptimization at this safepoint to find all possible
 283   // debug-info recordings, as well as let GC find all oops.
 284 
 285   OopMapSet *oop_maps = new OopMapSet();
 286   OopMap* map = new OopMap(frame_size_in_slots, 0);
 287 
 288 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 289 
 290   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 291   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 292   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 293   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 294   // rbp location is known implicitly by the frame sender code, needs no oopmap
 295   // and the location where rbp was saved is ignored
 296   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 297   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 298   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 299   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 300   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 301   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 302   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 303   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 304   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 305   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 306 
 307   if (UseAPX) {
 308     map->set_callee_saved(STACK_OFFSET( r16_off ), r16->as_VMReg());
 309     map->set_callee_saved(STACK_OFFSET( r17_off ), r17->as_VMReg());
 310     map->set_callee_saved(STACK_OFFSET( r18_off ), r18->as_VMReg());
 311     map->set_callee_saved(STACK_OFFSET( r19_off ), r19->as_VMReg());
 312     map->set_callee_saved(STACK_OFFSET( r20_off ), r20->as_VMReg());
 313     map->set_callee_saved(STACK_OFFSET( r21_off ), r21->as_VMReg());
 314     map->set_callee_saved(STACK_OFFSET( r22_off ), r22->as_VMReg());
 315     map->set_callee_saved(STACK_OFFSET( r23_off ), r23->as_VMReg());
 316     map->set_callee_saved(STACK_OFFSET( r24_off ), r24->as_VMReg());
 317     map->set_callee_saved(STACK_OFFSET( r25_off ), r25->as_VMReg());
 318     map->set_callee_saved(STACK_OFFSET( r26_off ), r26->as_VMReg());
 319     map->set_callee_saved(STACK_OFFSET( r27_off ), r27->as_VMReg());
 320     map->set_callee_saved(STACK_OFFSET( r28_off ), r28->as_VMReg());
 321     map->set_callee_saved(STACK_OFFSET( r29_off ), r29->as_VMReg());
 322     map->set_callee_saved(STACK_OFFSET( r30_off ), r30->as_VMReg());
 323     map->set_callee_saved(STACK_OFFSET( r31_off ), r31->as_VMReg());
 324   }
 325   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
 326   // on EVEX-enabled targets it is also included in the xsave area
 327   off = xmm0_off;
 328   int delta = xmm1_off - off;
 329   for (int n = 0; n < 16; n++) {
 330     XMMRegister xmm_name = as_XMMRegister(n);
 331     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 332     off += delta;
 333   }
 334   if (UseAVX > 2) {
 335     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 336     off = zmm16_off;
 337     delta = zmm17_off - off;
 338     for (int n = 16; n < num_xmm_regs; n++) {
 339       XMMRegister zmm_name = as_XMMRegister(n);
 340       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 341       off += delta;
 342     }
 343   }
 344 
 345 #if COMPILER2_OR_JVMCI
 346   if (save_wide_vectors) {
 347     // Save upper half of YMM registers(0..15)
 348     off = ymm0_off;
 349     delta = ymm1_off - ymm0_off;
 350     for (int n = 0; n < 16; n++) {
 351       XMMRegister ymm_name = as_XMMRegister(n);
 352       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 353       off += delta;
 354     }
 355     if (VM_Version::supports_evex()) {
 356       // Save upper half of ZMM registers(0..15)
 357       off = zmm0_off;
 358       delta = zmm1_off - zmm0_off;
 359       for (int n = 0; n < 16; n++) {
 360         XMMRegister zmm_name = as_XMMRegister(n);
 361         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 362         off += delta;
 363       }
 364     }
 365   }
 366 #endif // COMPILER2_OR_JVMCI
 367 
 368   // %%% These should all be a waste but we'll keep things as they were for now
 369   if (true) {
 370     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 371     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 372     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 373     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 374     // rbp location is known implicitly by the frame sender code, needs no oopmap
 375     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 376     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 377     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 378     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 379     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 380     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 381     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 382     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 383     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 384     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 385     if (UseAPX) {
 386       map->set_callee_saved(STACK_OFFSET( r16H_off ), r16->as_VMReg()->next());
 387       map->set_callee_saved(STACK_OFFSET( r17H_off ), r17->as_VMReg()->next());
 388       map->set_callee_saved(STACK_OFFSET( r18H_off ), r18->as_VMReg()->next());
 389       map->set_callee_saved(STACK_OFFSET( r19H_off ), r19->as_VMReg()->next());
 390       map->set_callee_saved(STACK_OFFSET( r20H_off ), r20->as_VMReg()->next());
 391       map->set_callee_saved(STACK_OFFSET( r21H_off ), r21->as_VMReg()->next());
 392       map->set_callee_saved(STACK_OFFSET( r22H_off ), r22->as_VMReg()->next());
 393       map->set_callee_saved(STACK_OFFSET( r23H_off ), r23->as_VMReg()->next());
 394       map->set_callee_saved(STACK_OFFSET( r24H_off ), r24->as_VMReg()->next());
 395       map->set_callee_saved(STACK_OFFSET( r25H_off ), r25->as_VMReg()->next());
 396       map->set_callee_saved(STACK_OFFSET( r26H_off ), r26->as_VMReg()->next());
 397       map->set_callee_saved(STACK_OFFSET( r27H_off ), r27->as_VMReg()->next());
 398       map->set_callee_saved(STACK_OFFSET( r28H_off ), r28->as_VMReg()->next());
 399       map->set_callee_saved(STACK_OFFSET( r29H_off ), r29->as_VMReg()->next());
 400       map->set_callee_saved(STACK_OFFSET( r30H_off ), r30->as_VMReg()->next());
 401       map->set_callee_saved(STACK_OFFSET( r31H_off ), r31->as_VMReg()->next());
 402     }
 403     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
 404     // on EVEX-enabled targets it is also included in the xsave area
 405     off = xmm0H_off;
 406     delta = xmm1H_off - off;
 407     for (int n = 0; n < 16; n++) {
 408       XMMRegister xmm_name = as_XMMRegister(n);
 409       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 410       off += delta;
 411     }
 412     if (UseAVX > 2) {
 413       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 414       off = zmm16H_off;
 415       delta = zmm17H_off - off;
 416       for (int n = 16; n < num_xmm_regs; n++) {
 417         XMMRegister zmm_name = as_XMMRegister(n);
 418         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 419         off += delta;
 420       }
 421     }
 422   }
 423 
 424   return map;
 425 }
 426 
 427 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 428   int num_xmm_regs = XMMRegister::available_xmm_registers();
 429   if (frame::arg_reg_save_area_bytes != 0) {
 430     // Pop arg register save area
 431     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 432   }
 433 
 434 #if COMPILER2_OR_JVMCI
 435   if (restore_wide_vectors) {
 436     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 437     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 438   }
 439 #else
 440   assert(!restore_wide_vectors, "vectors are generated only by C2");
 441 #endif
 442 
 443   __ vzeroupper();
 444 
 445   // On EVEX enabled targets everything is handled in pop fpu state
 446   if (restore_wide_vectors) {
 447     // Restore upper half of YMM registers (0..15)
 448     int base_addr = XSAVE_AREA_YMM_BEGIN;
 449     for (int n = 0; n < 16; n++) {
 450       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 451     }
 452     if (VM_Version::supports_evex()) {
 453       // Restore upper half of ZMM registers (0..15)
 454       base_addr = XSAVE_AREA_ZMM_BEGIN;
 455       for (int n = 0; n < 16; n++) {
 456         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 457       }
 458       // Restore full ZMM registers(16..num_xmm_regs)
 459       base_addr = XSAVE_AREA_UPPERBANK;
 460       int vector_len = Assembler::AVX_512bit;
 461       int off = 0;
 462       for (int n = 16; n < num_xmm_regs; n++) {
 463         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 464       }
 465 #if COMPILER2_OR_JVMCI
 466       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 467       off = 0;
 468       for (int n = 0; n < KRegister::number_of_registers; n++) {
 469         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 470       }
 471 #endif
 472     }
 473   } else {
 474     if (VM_Version::supports_evex()) {
 475       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 476       int base_addr = XSAVE_AREA_UPPERBANK;
 477       int off = 0;
 478       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 479       for (int n = 16; n < num_xmm_regs; n++) {
 480         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 481       }
 482 #if COMPILER2_OR_JVMCI
 483       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 484       off = 0;
 485       for (int n = 0; n < KRegister::number_of_registers; n++) {
 486         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 487       }
 488 #endif
 489     }
 490   }
 491 
 492 #if COMPILER2_OR_JVMCI
 493   if (UseAPX) {
 494     int base_addr = XSAVE_AREA_EGPRS;
 495     int off = 0;
 496     for (int n = 16; n < Register::number_of_registers; n++) {
 497       __ movq(as_Register(n), Address(rsp, base_addr+(off++*8)));
 498     }
 499   }
 500 #endif
 501 
 502   // Recover CPU state
 503   __ pop_FPU_state();
 504   __ restore_legacy_gprs();
 505   __ addq(rsp, 8);
 506   __ popf();
 507   // Get the rbp described implicitly by the calling convention (no oopMap)
 508   __ pop(rbp);
 509 }
 510 
 511 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 512 
 513   // Just restore the result registers. Only used by deoptimization. By
 514   // now any callee-save register that needs to be restored to a c2
 515   // caller of the deoptee has been extracted into the vframeArray
 516   // and will be stuffed into the c2i adapter we create for later
 517   // restoration, so only result registers need to be restored here.
 518 
 519   // Restore fp result register
 520   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 521   // Restore integer result register
 522   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 523   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 524 
 525   // Pop all of the register save area off the stack except the return address
 526   __ addptr(rsp, return_offset_in_bytes());
 527 }
 528 
 529 // Is the vector's size (in bytes) bigger than the size saved by default?
 530 // 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
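     // For example, is_wide_vector(32) (a full YMM value) is true, while
     // is_wide_vector(16) is false because plain XMM state is already covered by
     // the default fxsave/fxrstor area.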
 531 bool SharedRuntime::is_wide_vector(int size) {
 532   return size > 16;
 533 }
 534 
 535 // ---------------------------------------------------------------------------
 536 // Read the array of BasicTypes from a signature, and compute where the
 537 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 538 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 539 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 540 // as framesizes are fixed.
 541 // VMRegImpl::stack0 refers to the first slot 0(sp),
 542 // and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
 543 // Registers up to Register::number_of_registers are the 64-bit
 544 // integer registers.
 545 
 546 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 547 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 548 // units regardless of build. Of course, for i486 there is no 64-bit build
 549 
 550 // The Java calling convention is a "shifted" version of the C ABI.
 551 // By skipping the first C ABI register we can call non-static JNI methods
 552 // with small numbers of arguments without having to shuffle the arguments
 553 // at all. Since we control the Java ABI we ought to at least get some
 554 // advantage out of it.
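     // Illustrative sketch (hypothetical signature, not taken from this file): for
     // arguments (int, long, Object, float) the incoming sig_bt is
     //   { T_INT, T_LONG, T_VOID, T_OBJECT, T_FLOAT }
     // and the loop below assigns
     //   T_INT    -> j_rarg0      T_LONG  -> j_rarg1 (its T_VOID half is set_bad())
     //   T_OBJECT -> j_rarg2      T_FLOAT -> j_farg0
     // with stk_args == 0, since everything fits in registers. The "shift" means
     // j_rarg0 is the second C ABI integer register, leaving c_rarg0 free for the
     // JNIEnv* in the JNI case mentioned above.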
 555 
 556 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 557                                            VMRegPair *regs,
 558                                            int total_args_passed) {
 559 
 560   // Create the mapping between argument positions and
 561   // registers.
 562   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 563     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 564   };
 565   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 566     j_farg0, j_farg1, j_farg2, j_farg3,
 567     j_farg4, j_farg5, j_farg6, j_farg7
 568   };
 569 
 570 
 571   uint int_args = 0;
 572   uint fp_args = 0;
 573   uint stk_args = 0;
 574 
 575   for (int i = 0; i < total_args_passed; i++) {
 576     switch (sig_bt[i]) {
 577     case T_BOOLEAN:
 578     case T_CHAR:
 579     case T_BYTE:
 580     case T_SHORT:
 581     case T_INT:
 582       if (int_args < Argument::n_int_register_parameters_j) {
 583         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 584       } else {
 585         stk_args = align_up(stk_args, 2);
 586         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 587         stk_args += 1;
 588       }
 589       break;
 590     case T_VOID:
 591       // halves of T_LONG or T_DOUBLE
 592       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 593       regs[i].set_bad();
 594       break;
 595     case T_LONG:
 596       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 597       // fall through
 598     case T_OBJECT:
 599     case T_ARRAY:
 600     case T_ADDRESS:
 601       if (int_args < Argument::n_int_register_parameters_j) {
 602         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 603       } else {
 604         stk_args = align_up(stk_args, 2);
 605         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 606         stk_args += 2;
 607       }
 608       break;
 609     case T_FLOAT:
 610       if (fp_args < Argument::n_float_register_parameters_j) {
 611         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 612       } else {
 613         stk_args = align_up(stk_args, 2);
 614         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 615         stk_args += 1;
 616       }
 617       break;
 618     case T_DOUBLE:
 619       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 620       if (fp_args < Argument::n_float_register_parameters_j) {
 621         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 622       } else {
 623         stk_args = align_up(stk_args, 2);
 624         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 625         stk_args += 2;
 626       }
 627       break;
 628     default:
 629       ShouldNotReachHere();
 630       break;
 631     }
 632   }
 633 
 634   return stk_args;
 635 }
 636 
 637 // Patch the caller's callsite with the entry to compiled code if it exists.
 638 static void patch_callers_callsite(MacroAssembler *masm) {
 639   Label L;
 640   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 641   __ jcc(Assembler::equal, L);
 642 
 643   // Save the current stack pointer
 644   __ mov(r13, rsp);
 645   // Schedule the branch target address early.
 646   // Call into the VM to patch the caller, then jump to compiled callee
 647   // rax isn't live so capture return address while we easily can
 648   __ movptr(rax, Address(rsp, 0));
 649 
 650   // align stack so push_CPU_state doesn't fault
 651   __ andptr(rsp, -(StackAlignmentInBytes));
 652   __ push_CPU_state();
 653   __ vzeroupper();
 654   // VM needs caller's callsite
 655   // VM needs target method
 656   // This needs to be a long call since we will relocate this adapter to
 657   // the codeBuffer and it may not reach
 658 
 659   // Allocate argument register save area
 660   if (frame::arg_reg_save_area_bytes != 0) {
 661     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 662   }
 663   __ mov(c_rarg0, rbx);
 664   __ mov(c_rarg1, rax);
 665   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 666 
 667   // De-allocate argument register save area
 668   if (frame::arg_reg_save_area_bytes != 0) {
 669     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 670   }
 671 
 672   __ vzeroupper();
 673   __ pop_CPU_state();
 674   // restore sp
 675   __ mov(rsp, r13);
 676   __ bind(L);
 677 }
 678 
 679 static void gen_c2i_adapter(MacroAssembler *masm,
 680                             int total_args_passed,
 681                             int comp_args_on_stack,
 682                             const BasicType *sig_bt,
 683                             const VMRegPair *regs,
 684                             Label& skip_fixup) {
 685   // Before we get into the guts of the C2I adapter, see if we should be here
 686   // at all.  We've come from compiled code and are attempting to jump to the
 687   // interpreter, which means the caller made a static call to get here
 688   // (vcalls always get a compiled target if there is one).  Check for a
 689   // compiled target.  If there is one, we need to patch the caller's call.
 690   patch_callers_callsite(masm);
 691 
 692   __ bind(skip_fixup);
 693 
 694   // Since all args are passed on the stack, total_args_passed *
 695   // Interpreter::stackElementSize is the space we need.
 696 
 697   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 698 
 699   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 700 
 701   // stack is aligned, keep it that way
 702   // This is not currently needed or enforced by the interpreter, but
 703   // we might as well conform to the ABI.
 704   extraspace = align_up(extraspace, 2*wordSize);
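       // For instance (illustrative only): three interpreter arguments need
       // 3 * Interpreter::stackElementSize == 24 bytes here, rounded up to 32
       // so the stack stays 16-byte aligned.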
 705 
 706   // set senderSP value
 707   __ lea(r13, Address(rsp, wordSize));
 708 
 709 #ifdef ASSERT
 710   __ check_stack_alignment(r13, "sender stack not aligned");
 711 #endif
 712   if (extraspace > 0) {
 713     // Pop the return address
 714     __ pop(rax);
 715 
 716     __ subptr(rsp, extraspace);
 717 
 718     // Push the return address
 719     __ push(rax);
 720 
 721     // Account for the return address location since we store it first rather
 722     // than hold it in a register across all the shuffling
 723     extraspace += wordSize;
 724   }
 725 
 726 #ifdef ASSERT
 727   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 728 #endif
 729 
 730   // Now write the args into the outgoing interpreter space
 731   for (int i = 0; i < total_args_passed; i++) {
 732     if (sig_bt[i] == T_VOID) {
 733       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 734       continue;
 735     }
 736 
 737     // offset to start parameters
 738     int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
 739     int next_off = st_off - Interpreter::stackElementSize;
 740 
 741     // Say 4 args:
 742     // i   st_off
 743     // 0   32 T_LONG
 744     // 1   24 T_VOID
 745     // 2   16 T_OBJECT
 746     // 3    8 T_BOOL
 747     // -    0 return address
 748     //
 749     // However, to make things extra confusing: because we can fit a long/double in
 750     // a single slot on a 64-bit VM, and it would be silly to break them up, the interpreter
 751     // leaves one slot empty and only stores to a single slot. In this case the
 752     // slot that is occupied is the T_VOID slot. See, I said it was confusing.
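         // Concretely, for the T_LONG at i == 0 in the table above: the 64-bit value
         // is stored at next_off == 24 (the T_VOID slot) and st_off == 32 is left
         // unused (debug builds fill it with a junk pattern below).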
 753 
 754     VMReg r_1 = regs[i].first();
 755     VMReg r_2 = regs[i].second();
 756     if (!r_1->is_valid()) {
 757       assert(!r_2->is_valid(), "");
 758       continue;
 759     }
 760     if (r_1->is_stack()) {
 761       // memory to memory use rax
 762       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 763       if (!r_2->is_valid()) {
 764         // sign extend??
 765         __ movl(rax, Address(rsp, ld_off));
 766         __ movptr(Address(rsp, st_off), rax);
 767 
 768       } else {
 769 
 770         __ movq(rax, Address(rsp, ld_off));
 771 
 772         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 773         // T_DOUBLE and T_LONG use two slots in the interpreter
 774         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 775           // ld_off == LSW, ld_off+wordSize == MSW
 776           // st_off == MSW, next_off == LSW
 777           __ movq(Address(rsp, next_off), rax);
 778 #ifdef ASSERT
 779           // Overwrite the unused slot with known junk
 780           __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 781           __ movptr(Address(rsp, st_off), rax);
 782 #endif /* ASSERT */
 783         } else {
 784           __ movq(Address(rsp, st_off), rax);
 785         }
 786       }
 787     } else if (r_1->is_Register()) {
 788       Register r = r_1->as_Register();
 789       if (!r_2->is_valid()) {
 790         // must be only an int (or less) so move only 32 bits to the slot
 791         // why not sign extend??
 792         __ movl(Address(rsp, st_off), r);
 793       } else {
 794         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 795         // T_DOUBLE and T_LONG use two slots in the interpreter
 796         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 797           // long/double in gpr
 798 #ifdef ASSERT
 799           // Overwrite the unused slot with known junk
 800           __ mov64(rax, CONST64(0xdeadffffdeadaaab));
 801           __ movptr(Address(rsp, st_off), rax);
 802 #endif /* ASSERT */
 803           __ movq(Address(rsp, next_off), r);
 804         } else {
 805           __ movptr(Address(rsp, st_off), r);
 806         }
 807       }
 808     } else {
 809       assert(r_1->is_XMMRegister(), "");
 810       if (!r_2->is_valid()) {
 811         // only a float use just part of the slot
 812         __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
 813       } else {
 814 #ifdef ASSERT
 815         // Overwrite the unused slot with known junk
 816         __ mov64(rax, CONST64(0xdeadffffdeadaaac));
 817         __ movptr(Address(rsp, st_off), rax);
 818 #endif /* ASSERT */
 819         __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
 820       }
 821     }
 822   }
 823 
 824   // Schedule the branch target address early.
 825   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 826   __ jmp(rcx);
 827 }
 828 
 829 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 830                                     int total_args_passed,
 831                                     int comp_args_on_stack,
 832                                     const BasicType *sig_bt,
 833                                     const VMRegPair *regs) {
 834 
 835   // Note: r13 contains the senderSP on entry. We must preserve it since
 836   // we may do an i2c -> c2i transition if we lose a race where compiled
 837   // code goes non-entrant while we get the args ready.
 838   // In addition we use r13 to locate all the interpreter args, as
 839   // we must align the stack to 16 bytes on an i2c entry or else we
 840   // lose the alignment we expect in all compiled code, and the register
 841   // save code can segv when fxsave instructions find an improperly
 842   // aligned stack pointer.
 843 
 844   // Adapters can be frameless because they do not require the caller
 845   // to perform additional cleanup work, such as correcting the stack pointer.
 846   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 847   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 848   // even if a callee has modified the stack pointer.
 849   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 850   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 851   // up via the senderSP register).
 852   // In other words, if *either* the caller or callee is interpreted, we can
 853   // get the stack pointer repaired after a call.
 854   // This is why c2i and i2c adapters cannot be indefinitely composed.
 855   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 856   // both caller and callee would be compiled methods, and neither would
 857   // clean up the stack pointer changes performed by the two adapters.
 858   // If this happens, control eventually transfers back to the compiled
 859   // caller, but with an uncorrected stack, causing delayed havoc.
 860 
 861   // Must preserve original SP for loading incoming arguments because
 862   // we need to align the outgoing SP for compiled code.
 863   __ movptr(r11, rsp);
 864 
 865   // Pick up the return address
 866   __ pop(rax);
 867 
 868   // Convert 4-byte c2 stack slots to words.
 869   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
 870 
 871   if (comp_args_on_stack) {
 872     __ subptr(rsp, comp_words_on_stack * wordSize);
 873   }
 874 
 875   // Ensure compiled code always sees stack at proper alignment
 876   __ andptr(rsp, -16);
 877 
 878   // Push the return address, misaligning the stack so that the youngest frame
 879   // sees the same layout it would after a normal call instruction.
 880   __ push(rax);
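       // Illustrative arithmetic only: with comp_args_on_stack == 3 (three 4-byte
       // slots), comp_words_on_stack == 2, i.e. rsp drops by 16 bytes; the andptr and
       // push above then leave the "16-byte aligned minus one pushed return address"
       // layout that compiled code expects at its entry point.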
 881 
 882   // Put saved SP in another register
 883   const Register saved_sp = rax;
 884   __ movptr(saved_sp, r11);
 885 
 886   // Will jump to the compiled code just as if compiled code was doing it.
 887   // Pre-load the register-jump target early, to schedule it better.
 888   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
 889 
 890 #if INCLUDE_JVMCI
 891   if (EnableJVMCI) {
 892     // check if this call should be routed towards a specific entry point
 893     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 894     Label no_alternative_target;
 895     __ jcc(Assembler::equal, no_alternative_target);
 896     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
 897     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 898     __ bind(no_alternative_target);
 899   }
 900 #endif // INCLUDE_JVMCI
 901 
 902   // Now generate the shuffle code.  Pick up all register args and move the
 903   // remaining stack-passed args into the outgoing stack slots.
 904   for (int i = 0; i < total_args_passed; i++) {
 905     if (sig_bt[i] == T_VOID) {
 906       // Longs and doubles are passed in native word order, but misaligned
 907       // in the 32-bit build.
 908       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 909       continue;
 910     }
 911 
 912     // Pick up 0, 1 or 2 words from SP+offset.
 913 
 914     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
 915             "scrambled load targets?");
 916     // Load in argument order going down.
 917     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
 918     // Point to interpreter value (vs. tag)
 919     int next_off = ld_off - Interpreter::stackElementSize;
 920     //
 921     //
 922     //
 923     VMReg r_1 = regs[i].first();
 924     VMReg r_2 = regs[i].second();
 925     if (!r_1->is_valid()) {
 926       assert(!r_2->is_valid(), "");
 927       continue;
 928     }
 929     if (r_1->is_stack()) {
 930       // Convert stack slot to an SP offset (+ wordSize to account for return address )
 931       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
 932 
 933       // We can use r13 as a temp here because compiled code doesn't need r13 as an input,
 934       // and if we end up going through a c2i because of a miss, a reasonable value of r13
 935       // will be generated.
 936       if (!r_2->is_valid()) {
 937         // sign extend???
 938         __ movl(r13, Address(saved_sp, ld_off));
 939         __ movptr(Address(rsp, st_off), r13);
 940       } else {
 941         //
 942         // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE.
 943         // The interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
 944         // so we must adjust where to pick up the data to match the interpreter.
 945         //
 946         // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
 947         // are accessed with negative offsets, so the LSW is at the lower address
 948 
 949         // ld_off is MSW so get LSW
 950         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 951                            next_off : ld_off;
 952         __ movq(r13, Address(saved_sp, offset));
 953         // st_off is LSW (i.e. reg.first())
 954         __ movq(Address(rsp, st_off), r13);
 955       }
 956     } else if (r_1->is_Register()) {  // Register argument
 957       Register r = r_1->as_Register();
 958       assert(r != rax, "must be different");
 959       if (r_2->is_valid()) {
 960         //
 961         // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE.
 962         // The interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
 963         // so we must adjust where to pick up the data to match the interpreter.
 964 
 965         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 966                            next_off : ld_off;
 967 
 968         // this can be a misaligned move
 969         __ movq(r, Address(saved_sp, offset));
 970       } else {
 971         // sign extend and use a full word?
 972         __ movl(r, Address(saved_sp, ld_off));
 973       }
 974     } else {
 975       if (!r_2->is_valid()) {
 976         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
 977       } else {
 978         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
 979       }
 980     }
 981   }
 982 
 983   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
 984 
 985   // 6243940 We might end up in handle_wrong_method if
 986   // the callee is deoptimized as we race through here. If that
 987   // happens we don't want to take a safepoint because the
 988   // caller frame will look interpreted and the arguments are now
 989   // "compiled", so it is much better to make this transition
 990   // invisible to the stack walking code. Unfortunately, if
 991   // we try to find the callee by normal means a safepoint
 992   // is possible. So we stash the desired callee in the thread
 993   // and the VM will find it there should this case occur.
 994 
 995   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
 996 
 997   // put Method* where a c2i would expect it should we end up there;
 998   // only needed because c2 resolve stubs return Method* as a result in
 999   // rax
1000   __ mov(rax, rbx);
1001   __ jmp(r11);
1002 }
1003 
1004 // ---------------------------------------------------------------
1005 void SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
1006                                             int total_args_passed,
1007                                             int comp_args_on_stack,
1008                                             const BasicType *sig_bt,
1009                                             const VMRegPair *regs,
1010                                             address entry_address[AdapterBlob::ENTRY_COUNT]) {
1011   entry_address[AdapterBlob::I2C] = __ pc();
1012 
1013   gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
1014 
1015   // -------------------------------------------------------------------------
1016   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
1017   // to the interpreter.  The args start out packed in the compiled layout.  They
1018   // need to be unpacked into the interpreter layout.  This will almost always
1019   // require some stack space.  We grow the current (compiled) stack, then repack
1020   // the args.  We  finally end in a jump to the generic interpreter entry point.
1021   // On exit from the interpreter, the interpreter will restore our SP (lest the
1022   // compiled code, which relies solely on SP and not RBP, get sick).
1023 
1024   entry_address[AdapterBlob::C2I_Unverified] = __ pc();
1025   Label skip_fixup;
1026 
1027   Register data = rax;
1028   Register receiver = j_rarg0;
1029   Register temp = rbx;
1030 
1031   {
1032     __ ic_check(1 /* end_alignment */);
1033     __ movptr(rbx, Address(data, CompiledICData::speculated_method_offset()));
1034     // The method might have been compiled since the call site was patched to
1035     // interpreted; if that is the case, treat it as a miss so we can get
1036     // the call site corrected.
1037     __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1038     __ jcc(Assembler::equal, skip_fixup);
1039     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1040   }
1041 
1042   entry_address[AdapterBlob::C2I] = __ pc();
1043 
1044   // Class initialization barrier for static methods
1045   entry_address[AdapterBlob::C2I_No_Clinit_Check] = nullptr;
1046   if (VM_Version::supports_fast_class_init_checks()) {
1047     Label L_skip_barrier;
1048     Register method = rbx;
1049 
1050     { // Bypass the barrier for non-static methods
1051       Register flags = rscratch1;
1052       __ load_unsigned_short(flags, Address(method, Method::access_flags_offset()));
1053       __ testl(flags, JVM_ACC_STATIC);
1054       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1055     }
1056 
1057     Register klass = rscratch1;
1058     __ load_method_holder(klass, method);
1059     __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
1060 
1061     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1062 
1063     __ bind(L_skip_barrier);
1064     entry_address[AdapterBlob::C2I_No_Clinit_Check] = __ pc();
1065   }
1066 
1067   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1068   bs->c2i_entry_barrier(masm);
1069 
1070   gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1071   return;
1072 }
1073 
1074 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1075                                          VMRegPair *regs,
1076                                          int total_args_passed) {
1077 
1078 // We return the number of VMRegImpl stack slots we need to reserve for all
1079 // the arguments NOT counting out_preserve_stack_slots.
1080 
1081 // NOTE: These arrays will have to change when c1 is ported
1082 #ifdef _WIN64
1083     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1084       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1085     };
1086     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1087       c_farg0, c_farg1, c_farg2, c_farg3
1088     };
1089 #else
1090     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1091       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1092     };
1093     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1094       c_farg0, c_farg1, c_farg2, c_farg3,
1095       c_farg4, c_farg5, c_farg6, c_farg7
1096     };
1097 #endif // _WIN64
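         // Illustrative sketch (hypothetical native signature): for (jint, jlong, jfloat)
         // the loop below yields, on the System V ABI,
         //   jint -> c_rarg0, jlong -> c_rarg1, jfloat -> c_farg0, stk_args == 0;
         // on Win64 the integer and FP counters advance together, so the jfloat lands
         // in c_farg2 (third argument position) and stk_args accumulates two slots per
         // register argument before being raised to the 8-slot minimum at the end.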
1098 
1099 
1100     uint int_args = 0;
1101     uint fp_args = 0;
1102     uint stk_args = 0; // inc by 2 each time
1103 
1104     for (int i = 0; i < total_args_passed; i++) {
1105       switch (sig_bt[i]) {
1106       case T_BOOLEAN:
1107       case T_CHAR:
1108       case T_BYTE:
1109       case T_SHORT:
1110       case T_INT:
1111         if (int_args < Argument::n_int_register_parameters_c) {
1112           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1113 #ifdef _WIN64
1114           fp_args++;
1115           // Allocate slots for the callee to stuff register args on the stack.
1116           stk_args += 2;
1117 #endif
1118         } else {
1119           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1120           stk_args += 2;
1121         }
1122         break;
1123       case T_LONG:
1124         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1125         // fall through
1126       case T_OBJECT:
1127       case T_ARRAY:
1128       case T_ADDRESS:
1129       case T_METADATA:
1130         if (int_args < Argument::n_int_register_parameters_c) {
1131           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1132 #ifdef _WIN64
1133           fp_args++;
1134           stk_args += 2;
1135 #endif
1136         } else {
1137           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1138           stk_args += 2;
1139         }
1140         break;
1141       case T_FLOAT:
1142         if (fp_args < Argument::n_float_register_parameters_c) {
1143           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1144 #ifdef _WIN64
1145           int_args++;
1146           // Allocate slots for the callee to stuff register args on the stack.
1147           stk_args += 2;
1148 #endif
1149         } else {
1150           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1151           stk_args += 2;
1152         }
1153         break;
1154       case T_DOUBLE:
1155         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1156         if (fp_args < Argument::n_float_register_parameters_c) {
1157           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1158 #ifdef _WIN64
1159           int_args++;
1160           // Allocate slots for the callee to stuff register args on the stack.
1161           stk_args += 2;
1162 #endif
1163         } else {
1164           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1165           stk_args += 2;
1166         }
1167         break;
1168       case T_VOID: // Halves of longs and doubles
1169         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1170         regs[i].set_bad();
1171         break;
1172       default:
1173         ShouldNotReachHere();
1174         break;
1175       }
1176     }
1177 #ifdef _WIN64
1178   // The Windows ABI requires that we always allocate enough stack space
1179   // for 4 64-bit register arguments to be stored down.
1180   if (stk_args < 8) {
1181     stk_args = 8;
1182   }
1183 #endif // _WIN64
1184 
1185   return stk_args;
1186 }
1187 
1188 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1189                                              uint num_bits,
1190                                              uint total_args_passed) {
1191   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1192          "only certain vector sizes are supported for now");
1193 
1194   static const XMMRegister VEC_ArgReg[32] = {
1195      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1196      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1197     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1198     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1199   };
1200 
1201   uint stk_args = 0;
1202   uint fp_args = 0;
1203 
1204   for (uint i = 0; i < total_args_passed; i++) {
1205     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1206     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1207     regs[i].set_pair(vmreg->next(next_val), vmreg);
1208   }
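       // For example (illustrative only): with num_bits == 256, argument i is given
       // VEC_ArgReg[i] paired with its ->next(7), i.e. eight 32-bit VMReg slots that
       // together describe one YMM register; stk_args stays 0 because vector
       // arguments are never passed on the stack here.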
1209 
1210   return stk_args;
1211 }
1212 
1213 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1214   // We always ignore the frame_slots arg and just use the space just below the frame
1215   // pointer, which by this time is free to use
1216   switch (ret_type) {
1217   case T_FLOAT:
1218     __ movflt(Address(rbp, -wordSize), xmm0);
1219     break;
1220   case T_DOUBLE:
1221     __ movdbl(Address(rbp, -wordSize), xmm0);
1222     break;
1223   case T_VOID:  break;
1224   default: {
1225     __ movptr(Address(rbp, -wordSize), rax);
1226     }
1227   }
1228 }
1229 
1230 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1231   // We always ignore the frame_slots arg and just use the space just below the frame
1232   // pointer, which by this time is free to use
1233   switch (ret_type) {
1234   case T_FLOAT:
1235     __ movflt(xmm0, Address(rbp, -wordSize));
1236     break;
1237   case T_DOUBLE:
1238     __ movdbl(xmm0, Address(rbp, -wordSize));
1239     break;
1240   case T_VOID:  break;
1241   default: {
1242     __ movptr(rax, Address(rbp, -wordSize));
1243     }
1244   }
1245 }
1246 
1247 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1248     for ( int i = first_arg ; i < arg_count ; i++ ) {
1249       if (args[i].first()->is_Register()) {
1250         __ push(args[i].first()->as_Register());
1251       } else if (args[i].first()->is_XMMRegister()) {
1252         __ subptr(rsp, 2*wordSize);
1253         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1254       }
1255     }
1256 }
1257 
1258 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1259     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1260       if (args[i].first()->is_Register()) {
1261         __ pop(args[i].first()->as_Register());
1262       } else if (args[i].first()->is_XMMRegister()) {
1263         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1264         __ addptr(rsp, 2*wordSize);
1265       }
1266     }
1267 }
1268 
1269 static void verify_oop_args(MacroAssembler* masm,
1270                             const methodHandle& method,
1271                             const BasicType* sig_bt,
1272                             const VMRegPair* regs) {
1273   Register temp_reg = rbx;  // not part of any compiled calling seq
1274   if (VerifyOops) {
1275     for (int i = 0; i < method->size_of_parameters(); i++) {
1276       if (is_reference_type(sig_bt[i])) {
1277         VMReg r = regs[i].first();
1278         assert(r->is_valid(), "bad oop arg");
1279         if (r->is_stack()) {
1280           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1281           __ verify_oop(temp_reg);
1282         } else {
1283           __ verify_oop(r->as_Register());
1284         }
1285       }
1286     }
1287   }
1288 }
1289 
1290 static void check_continuation_enter_argument(VMReg actual_vmreg,
1291                                               Register expected_reg,
1292                                               const char* name) {
1293   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1294   assert(actual_vmreg->as_Register() == expected_reg,
1295          "%s is in unexpected register: %s instead of %s",
1296          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1297 }
1298 
1299 
1300 //---------------------------- continuation_enter_setup ---------------------------
1301 //
1302 // Arguments:
1303 //   None.
1304 //
1305 // Results:
1306 //   rsp: pointer to blank ContinuationEntry
1307 //
1308 // Kills:
1309 //   rax
1310 //
1311 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1312   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1313   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1314   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1315 
1316   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1317   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1318 
1319   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1320   OopMap* map = new OopMap(frame_size, 0);
1321 
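  // Link the new entry into the thread's chain of continuation entries: the previous
  // cont_entry becomes this entry's parent, and the new entry (rsp) becomes current.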
1322   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1323   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1324   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1325 
1326   return map;
1327 }
1328 
1329 //---------------------------- fill_continuation_entry ---------------------------
1330 //
1331 // Arguments:
1332 //   rsp: pointer to blank ContinuationEntry
1333 //   reg_cont_obj: pointer to the continuation
1334 //   reg_flags: flags
1335 //
1336 // Results:
1337 //   rsp: pointer to filled out ContinuationEntry
1338 //
1339 // Kills:
1340 //   rax
1341 //
1342 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1343   assert_different_registers(rax, reg_cont_obj, reg_flags);
1344 #ifdef ASSERT
1345   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1346 #endif
1347   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1348   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1349   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1350   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1351   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1352 
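  // Save the caller's cont_fastpath into the entry and clear the thread's field for the
  // new continuation.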
1353   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1354   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1355 
1356   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1357 }
1358 
1359 //---------------------------- continuation_enter_cleanup ---------------------------
1360 //
1361 // Arguments:
1362 //   rsp: pointer to the ContinuationEntry
1363 //
1364 // Results:
1365 //   rsp: pointer to the spilled rbp in the entry frame
1366 //
1367 // Kills:
1368 //   rbx
1369 //
1370 static void continuation_enter_cleanup(MacroAssembler* masm) {
1371 #ifdef ASSERT
1372   Label L_good_sp;
1373   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1374   __ jcc(Assembler::equal, L_good_sp);
1375   __ stop("Incorrect rsp at continuation_enter_cleanup");
1376   __ bind(L_good_sp);
1377 #endif
1378   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1379   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1380   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1381   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1382   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1383 }
1384 
1385 static void gen_continuation_enter(MacroAssembler* masm,
1386                                    const VMRegPair* regs,
1387                                    int& exception_offset,
1388                                    OopMapSet* oop_maps,
1389                                    int& frame_complete,
1390                                    int& stack_slots,
1391                                    int& interpreted_entry_offset,
1392                                    int& compiled_entry_offset) {
1393 
1394   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1395   int pos_cont_obj   = 0;
1396   int pos_is_cont    = 1;
1397   int pos_is_virtual = 2;
1398 
1399   // The platform-specific calling convention may present the arguments in various registers.
1400 // To simplify the rest of the code, we expect the arguments to reside in these known
1401 // registers, and we additionally check the placement here in case the calling convention
1402 // ever changes.
1403   Register reg_cont_obj   = c_rarg1;
1404   Register reg_is_cont    = c_rarg2;
1405   Register reg_is_virtual = c_rarg3;
1406 
1407   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1408   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1409   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1410 
1411   // Utility methods kill rax, make sure there are no collisions
1412   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1413 
1414   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1415                          relocInfo::static_call_type);
1416 
1417   address start = __ pc();
1418 
1419   Label L_thaw, L_exit;
1420 
1421   // i2i entry used at interp_only_mode only
1422   interpreted_entry_offset = __ pc() - start;
1423   {
1424 #ifdef ASSERT
1425     Label is_interp_only;
1426     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1427     __ jcc(Assembler::notEqual, is_interp_only);
1428     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1429     __ bind(is_interp_only);
1430 #endif
1431 
1432     __ pop(rax); // return address
1433     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
1434     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1435     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1436     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
1437     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1438     __ push(rax); // return address
1439     __ push_cont_fastpath();
1440 
1441     __ enter();
1442 
1443     stack_slots = 2; // will be adjusted in setup
1444     OopMap* map = continuation_enter_setup(masm, stack_slots);
1445     // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe.
1446     // That's okay: at the very worst we'll miss an async sample, and we're in interp_only_mode anyway.
1447 
1448     __ verify_oop(reg_cont_obj);
1449 
1450     fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1451 
1452     // If continuation, call to thaw. Otherwise, resolve the call and exit.
1453     __ testptr(reg_is_cont, reg_is_cont);
1454     __ jcc(Assembler::notZero, L_thaw);
1455 
1456     // --- Resolve path
1457 
1458     // Make sure the call is patchable
1459     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1460     // Emit stub for static call
1461     address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1462     if (stub == nullptr) {
1463       fatal("CodeCache is full at gen_continuation_enter");
1464     }
1465     __ call(resolve);
1466     oop_maps->add_gc_map(__ pc() - start, map);
1467     __ post_call_nop();
1468 
1469     __ jmp(L_exit);
1470   }
1471 
1472   // compiled entry
1473   __ align(CodeEntryAlignment);
1474   compiled_entry_offset = __ pc() - start;
1475   __ enter();
1476 
1477   stack_slots = 2; // will be adjusted in setup
1478   OopMap* map = continuation_enter_setup(masm, stack_slots);
1479 
1480   // Frame is now completed as far as size and linkage.
1481   frame_complete = __ pc() - start;
1482 
1483   __ verify_oop(reg_cont_obj);
1484 
1485   fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1486 
1487   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1488   __ testptr(reg_is_cont, reg_is_cont);
1489   __ jccb(Assembler::notZero, L_thaw);
1490 
1491   // --- call Continuation.enter(Continuation c, boolean isContinue)
1492 
1493   // Make sure the call is patchable
1494   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1495 
1496   // Emit stub for static call
1497   address stub = CompiledDirectCall::emit_to_interp_stub(masm, __ pc());
1498   if (stub == nullptr) {
1499     fatal("CodeCache is full at gen_continuation_enter");
1500   }
1501 
1502   // The call needs to be resolved. There's a special case for this in
1503   // SharedRuntime::find_callee_info_helper() which calls
1504   // LinkResolver::resolve_continuation_enter() which resolves the call to
1505   // Continuation.enter(Continuation c, boolean isContinue).
1506   __ call(resolve);
1507 
1508   oop_maps->add_gc_map(__ pc() - start, map);
1509   __ post_call_nop();
1510 
1511   __ jmpb(L_exit);
1512 
1513   // --- Thawing path
1514 
1515   __ bind(L_thaw);
1516 
1517   ContinuationEntry::_thaw_call_pc_offset = __ pc() - start;
1518   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1519 
1520   ContinuationEntry::_return_pc_offset = __ pc() - start;
1521   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1522   __ post_call_nop();
1523 
1524   // --- Normal exit (resolve/thawing)
1525 
1526   __ bind(L_exit);
1527   ContinuationEntry::_cleanup_offset = __ pc() - start;
1528   continuation_enter_cleanup(masm);
1529   __ pop(rbp);
1530   __ ret(0);
1531 
1532   // --- Exception handling path
1533 
1534   exception_offset = __ pc() - start;
1535 
1536   continuation_enter_cleanup(masm);
1537   __ pop(rbp);
1538 
1539   __ movptr(c_rarg0, r15_thread);
1540   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1541 
1542   // rax still holds the original exception oop, save it before the call
1543   __ push(rax);
1544 
1545   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1546   __ movptr(rbx, rax);
1547 
1548   // Continue at exception handler:
1549   //   rax: exception oop
1550   //   rbx: exception handler
1551   //   rdx: exception pc
1552   __ pop(rax);
1553   __ verify_oop(rax);
1554   __ pop(rdx);
1555   __ jmp(rbx);
1556 }
1557 
1558 static void gen_continuation_yield(MacroAssembler* masm,
1559                                    const VMRegPair* regs,
1560                                    OopMapSet* oop_maps,
1561                                    int& frame_complete,
1562                                    int& stack_slots,
1563                                    int& compiled_entry_offset) {
1564   enum layout {
1565     rbp_off,
1566     rbpH_off,
1567     return_off,
1568     return_off2,
1569     framesize // inclusive of return address
1570   };
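  // Two 64-bit words (the saved rbp and the return address), each occupying two 32-bit
  // VMReg slots, which is what the assert below double-checks.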
1571   stack_slots = framesize /  VMRegImpl::slots_per_word;
1572   assert(stack_slots == 2, "recheck layout");
1573 
1574   address start = __ pc();
1575   compiled_entry_offset = __ pc() - start;
1576   __ enter();
1577   address the_pc = __ pc();
1578 
1579   frame_complete = the_pc - start;
1580 
1581   // This nop must be exactly at the PC we push into the frame info.
1582   // We use this nop for fast CodeBlob lookup, so associate the OopMap
1583   // with it right away.
1584   __ post_call_nop();
1585   OopMap* map = new OopMap(framesize, 1);
1586   oop_maps->add_gc_map(frame_complete, map);
1587 
1588   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1589   __ movptr(c_rarg0, r15_thread);
1590   __ movptr(c_rarg1, rsp);
1591   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1592   __ reset_last_Java_frame(true);
1593 
1594   Label L_pinned;
1595 
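  // freeze_entry() returns 0 on success. A non-zero result means the continuation was
  // not frozen (it is pinned), so we take the L_pinned path and return to the caller.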
1596   __ testptr(rax, rax);
1597   __ jcc(Assembler::notZero, L_pinned);
1598 
1599   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1600   continuation_enter_cleanup(masm);
1601   __ pop(rbp);
1602   __ ret(0);
1603 
1604   __ bind(L_pinned);
1605 
1606   // Pinned, return to caller
1607 
1608   // handle pending exception thrown by freeze
1609   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1610   Label ok;
1611   __ jcc(Assembler::equal, ok);
1612   __ leave();
1613   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1614   __ bind(ok);
1615 
1616   __ leave();
1617   __ ret(0);
1618 }
1619 
1620 void SharedRuntime::continuation_enter_cleanup(MacroAssembler* masm) {
1621   ::continuation_enter_cleanup(masm);
1622 }
1623 
1624 static void gen_special_dispatch(MacroAssembler* masm,
1625                                  const methodHandle& method,
1626                                  const BasicType* sig_bt,
1627                                  const VMRegPair* regs) {
1628   verify_oop_args(masm, method, sig_bt, regs);
1629   vmIntrinsics::ID iid = method->intrinsic_id();
1630 
1631   // Now write the args into the outgoing interpreter space
1632   bool     has_receiver   = false;
1633   Register receiver_reg   = noreg;
1634   int      member_arg_pos = -1;
1635   Register member_reg     = noreg;
1636   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1637   if (ref_kind != 0) {
1638     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1639     member_reg = rbx;  // known to be free at this point
1640     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1641   } else if (iid == vmIntrinsics::_invokeBasic) {
1642     has_receiver = true;
1643   } else if (iid == vmIntrinsics::_linkToNative) {
1644     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1645     member_reg = rbx;  // known to be free at this point
1646   } else {
1647     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1648   }
1649 
1650   if (member_reg != noreg) {
1651     // Load the member_arg into register, if necessary.
1652     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1653     VMReg r = regs[member_arg_pos].first();
1654     if (r->is_stack()) {
1655       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1656     } else {
1657       // no data motion is needed
1658       member_reg = r->as_Register();
1659     }
1660   }
1661 
1662   if (has_receiver) {
1663     // Make sure the receiver is loaded into a register.
1664     assert(method->size_of_parameters() > 0, "oob");
1665     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1666     VMReg r = regs[0].first();
1667     assert(r->is_valid(), "bad receiver arg");
1668     if (r->is_stack()) {
1669       // Porting note:  This assumes that compiled calling conventions always
1670       // pass the receiver oop in a register.  If this is not true on some
1671       // platform, pick a temp and load the receiver from stack.
1672       fatal("receiver always in a register");
1673       receiver_reg = j_rarg0;  // known to be free at this point
1674       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1675     } else {
1676       // no data motion is needed
1677       receiver_reg = r->as_Register();
1678     }
1679   }
1680 
1681   // Figure out which address we are really jumping to:
1682   MethodHandles::generate_method_handle_dispatch(masm, iid,
1683                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1684 }
1685 
1686 // ---------------------------------------------------------------------------
1687 // Generate a native wrapper for a given method.  The method takes arguments
1688 // in the Java compiled code convention, marshals them to the native
1689 // convention (handlizes oops, etc), transitions to native, makes the call,
1690 // returns to java state (possibly blocking), unhandlizes any result and
1691 // returns.
1692 //
1693 // Critical native functions are a shorthand for the use of
1694 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1695 // functions.  The wrapper is expected to unpack the arguments before
1696 // passing them to the callee. Critical native functions leave the state _in_Java,
1697 // since they cannot stop for GC.
1698 // Some other parts of JNI setup are skipped, like the tear down of the JNI handle
1699 // block and the check for pending exceptions, since it's impossible for them
1700 // to be thrown.
1701 //
1702 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1703                                                 const methodHandle& method,
1704                                                 int compile_id,
1705                                                 BasicType* in_sig_bt,
1706                                                 VMRegPair* in_regs,
1707                                                 BasicType ret_type) {
1708   if (method->is_continuation_native_intrinsic()) {
1709     int exception_offset = -1;
1710     OopMapSet* oop_maps = new OopMapSet();
1711     int frame_complete = -1;
1712     int stack_slots = -1;
1713     int interpreted_entry_offset = -1;
1714     int vep_offset = -1;
1715     if (method->is_continuation_enter_intrinsic()) {
1716       gen_continuation_enter(masm,
1717                              in_regs,
1718                              exception_offset,
1719                              oop_maps,
1720                              frame_complete,
1721                              stack_slots,
1722                              interpreted_entry_offset,
1723                              vep_offset);
1724     } else if (method->is_continuation_yield_intrinsic()) {
1725       gen_continuation_yield(masm,
1726                              in_regs,
1727                              oop_maps,
1728                              frame_complete,
1729                              stack_slots,
1730                              vep_offset);
1731     } else {
1732       guarantee(false, "Unknown Continuation native intrinsic");
1733     }
1734 
1735 #ifdef ASSERT
1736     if (method->is_continuation_enter_intrinsic()) {
1737       assert(interpreted_entry_offset != -1, "Must be set");
1738       assert(exception_offset != -1,         "Must be set");
1739     } else {
1740       assert(interpreted_entry_offset == -1, "Must be unset");
1741       assert(exception_offset == -1,         "Must be unset");
1742     }
1743     assert(frame_complete != -1,    "Must be set");
1744     assert(stack_slots != -1,       "Must be set");
1745     assert(vep_offset != -1,        "Must be set");
1746 #endif
1747 
1748     __ flush();
1749     nmethod* nm = nmethod::new_native_nmethod(method,
1750                                               compile_id,
1751                                               masm->code(),
1752                                               vep_offset,
1753                                               frame_complete,
1754                                               stack_slots,
1755                                               in_ByteSize(-1),
1756                                               in_ByteSize(-1),
1757                                               oop_maps,
1758                                               exception_offset);
1759     if (nm == nullptr) return nm;
1760     if (method->is_continuation_enter_intrinsic()) {
1761       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
1762     } else if (method->is_continuation_yield_intrinsic()) {
1763       _cont_doYield_stub = nm;
1764     }
1765     return nm;
1766   }
1767 
1768   if (method->is_method_handle_intrinsic()) {
1769     vmIntrinsics::ID iid = method->intrinsic_id();
1770     intptr_t start = (intptr_t)__ pc();
1771     int vep_offset = ((intptr_t)__ pc()) - start;
1772     gen_special_dispatch(masm,
1773                          method,
1774                          in_sig_bt,
1775                          in_regs);
1776     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1777     __ flush();
1778     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1779     return nmethod::new_native_nmethod(method,
1780                                        compile_id,
1781                                        masm->code(),
1782                                        vep_offset,
1783                                        frame_complete,
1784                                        stack_slots / VMRegImpl::slots_per_word,
1785                                        in_ByteSize(-1),
1786                                        in_ByteSize(-1),
1787                                        nullptr);
1788   }
1789   address native_func = method->native_function();
1790   assert(native_func != nullptr, "must have function");
1791 
1792   // An OopMap for lock (and class if static)
1793   OopMapSet *oop_maps = new OopMapSet();
1794   intptr_t start = (intptr_t)__ pc();
1795 
1796   // We have received a description of where all the java args are located
1797   // on entry to the wrapper. We need to convert these args to where
1798   // the jni function will expect them. To figure out where they go
1799   // we convert the java signature to a C signature by inserting
1800   // the hidden arguments as arg[0] and possibly arg[1] (static method)
1801 
1802   const int total_in_args = method->size_of_parameters();
1803   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
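  // +1 for the hidden JNIEnv* argument, and one more for the class mirror if the method is static.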
1804 
1805   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1806   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1807 
1808   int argc = 0;
1809   out_sig_bt[argc++] = T_ADDRESS;
1810   if (method->is_static()) {
1811     out_sig_bt[argc++] = T_OBJECT;
1812   }
1813 
1814   for (int i = 0; i < total_in_args ; i++ ) {
1815     out_sig_bt[argc++] = in_sig_bt[i];
1816   }
1817 
1818   // Now figure out where the args must be stored and how much stack space
1819   // they require.
1820   int out_arg_slots;
1821   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, total_c_args);
1822 
1823   // Compute framesize for the wrapper.  We need to handlize all oops in
1824   // incoming registers
1825 
1826   // Calculate the total number of stack slots we will need.
1827 
1828   // First count the abi requirement plus all of the outgoing args
1829   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1830 
1831   // Now the space for the inbound oop handle area
1832   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1833 
1834   int oop_handle_offset = stack_slots;
1835   stack_slots += total_save_slots;
1836 
1837   // Now any space we need for handlizing a klass if static method
1838 
1839   int klass_slot_offset = 0;
1840   int klass_offset = -1;
1841   int lock_slot_offset = 0;
1842   bool is_static = false;
1843 
1844   if (method->is_static()) {
1845     klass_slot_offset = stack_slots;
1846     stack_slots += VMRegImpl::slots_per_word;
1847     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1848     is_static = true;
1849   }
1850 
1851   // Plus a lock if needed
1852 
1853   if (method->is_synchronized()) {
1854     lock_slot_offset = stack_slots;
1855     stack_slots += VMRegImpl::slots_per_word;
1856   }
1857 
1858   // Now a place (+2) to save return values or temp during shuffling
1859   // + 4 for return address (which we own) and saved rbp
1860   stack_slots += 6;
1861 
1862   // Ok The space we have allocated will look like:
1863   //
1864   //
1865   // FP-> |                     |
1866   //      |---------------------|
1867   //      | 2 slots for moves   |
1868   //      |---------------------|
1869   //      | lock box (if sync)  |
1870   //      |---------------------| <- lock_slot_offset
1871   //      | klass (if static)   |
1872   //      |---------------------| <- klass_slot_offset
1873   //      | oopHandle area      |
1874   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1875   //      | outbound memory     |
1876   //      | based arguments     |
1877   //      |                     |
1878   //      |---------------------|
1879   //      |                     |
1880   // SP-> | out_preserved_slots |
1881   //
1882   //
1883 
1884 
1885   // Now compute actual number of stack words we need rounding to make
1886   // stack properly aligned.
1887   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1888 
1889   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
1890 
1891   // First thing make an ic check to see if we should even be here
1892 
1893   // We are free to use all registers as temps without saving them and
1894   // restoring them except rbp. rbp is the only callee save register
1895   // as far as the interpreter and the compiler(s) are concerned.
1896 
1897   const Register receiver = j_rarg0;
1898 
1899   Label exception_pending;
1900 
1901   assert_different_registers(receiver, rscratch1, rscratch2);
1902   __ verify_oop(receiver);
1903   __ ic_check(8 /* end_alignment */);
1904 
1905   int vep_offset = ((intptr_t)__ pc()) - start;
1906 
1907   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1908     Label L_skip_barrier;
1909     Register klass = r10;
1910     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1911     __ clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
1912 
1913     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1914 
1915     __ bind(L_skip_barrier);
1916   }
1917 
1918 #ifdef COMPILER1
1919   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
1920   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1921     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1922   }
1923 #endif // COMPILER1
1924 
1925   // The instruction at the verified entry point must be 5 bytes or longer
1926   // because it can be patched on the fly by make_not_entrant. The stack bang
1927   // instruction fits that requirement.
1928 
1929   // Generate stack overflow check
1930   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1931 
1932   // Generate a new frame for the wrapper.
1933   __ enter();
1934   // -2 because return address is already present and so is saved rbp
1935   __ subptr(rsp, stack_size - 2*wordSize);
1936 
1937   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1938   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
1939   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
1940 
1941   // Frame is now completed as far as size and linkage.
1942   int frame_complete = ((intptr_t)__ pc()) - start;
1943 
1944 #ifdef ASSERT
1945   __ check_stack_alignment(rsp, "improperly aligned stack");
1946 #endif /* ASSERT */
1947 
1948 
1949   // We use r14 as the oop handle for the receiver/klass
1950   // It is callee save so it survives the call to native
1951 
1952   const Register oop_handle_reg = r14;
1953 
1954   //
1955   // We immediately shuffle the arguments so that any vm call we have to
1956   // make from here on out (sync slow path, jvmti, etc.) we will have
1957   // captured the oops from our caller and have a valid oopMap for
1958   // them.
1959 
1960   // -----------------
1961   // The Grand Shuffle
1962 
1963   // The Java calling convention is either equal (linux) or denser (win64) than the
1964   // c calling convention. However, because of the jni_env argument the c calling
1965   // convention always has at least one more (and two for static) arguments than Java.
1966   // Therefore if we move the args from java -> c backwards then we will never have
1967   // a register->register conflict and we don't have to build a dependency graph
1968   // and figure out how to break any cycles.
1969   //
1970 
1971   // Record esp-based slot for receiver on stack for non-static methods
1972   int receiver_offset = -1;
1973 
1974   // This is a trick. We double the stack slots so we can claim
1975   // the oops in the caller's frame. Since we are sure to have
1976   // more args than the caller, doubling is enough to make
1977   // sure we can capture all the incoming oop args from the
1978   // caller.
1979   //
1980   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1981 
1982   // Mark location of rbp (someday)
1983   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
1984 
1985   // Use eax, ebx as temporaries during any memory-memory moves we have to do
1986   // All inbound args are referenced based on rbp and all outbound args via rsp.
1987 
1988 
1989 #ifdef ASSERT
1990   bool reg_destroyed[Register::number_of_registers];
1991   bool freg_destroyed[XMMRegister::number_of_registers];
1992   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
1993     reg_destroyed[r] = false;
1994   }
1995   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
1996     freg_destroyed[f] = false;
1997   }
1998 
1999 #endif /* ASSERT */
2000 
2001   // For JNI natives the incoming and outgoing registers are offset upwards.
2002   GrowableArray<int> arg_order(2 * total_in_args);
2003 
2004   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2005     arg_order.push(i);
2006     arg_order.push(c_arg);
2007   }
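  // arg_order now holds (java_index, c_index) pairs from the last argument down to the
  // first, so the shuffle below moves args backwards and avoids register->register
  // conflicts (see the note on the calling conventions above).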
2008 
2009   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2010     int i = arg_order.at(ai);
2011     int c_arg = arg_order.at(ai + 1);
2012     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2013 #ifdef ASSERT
2014     if (in_regs[i].first()->is_Register()) {
2015       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2016     } else if (in_regs[i].first()->is_XMMRegister()) {
2017       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2018     }
2019     if (out_regs[c_arg].first()->is_Register()) {
2020       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2021     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2022       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2023     }
2024 #endif /* ASSERT */
2025     switch (in_sig_bt[i]) {
2026       case T_ARRAY:
2027       case T_OBJECT:
2028         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2029                     ((i == 0) && (!is_static)),
2030                     &receiver_offset);
2031         break;
2032       case T_VOID:
2033         break;
2034 
2035       case T_FLOAT:
2036         __ float_move(in_regs[i], out_regs[c_arg]);
2037         break;
2038 
2039       case T_DOUBLE:
2040         assert( i + 1 < total_in_args &&
2041                 in_sig_bt[i + 1] == T_VOID &&
2042                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2043         __ double_move(in_regs[i], out_regs[c_arg]);
2044         break;
2045 
2046       case T_LONG :
2047         __ long_move(in_regs[i], out_regs[c_arg]);
2048         break;
2049 
2050       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2051 
2052       default:
2053         __ move32_64(in_regs[i], out_regs[c_arg]);
2054     }
2055   }
2056 
2057   int c_arg;
2058 
2059   // Pre-load a static method's oop into r14.  Used both by locking code and
2060   // the normal JNI call code.
2061   // point c_arg at the first arg that is already loaded in case we
2062   // need to spill before we call out
2063   c_arg = total_c_args - total_in_args;
2064 
2065   if (method->is_static()) {
2066 
2067     //  load oop into a register
2068     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2069 
2070     // Now handlize the static class mirror; it's known not-null.
2071     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2072     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2073 
2074     // Now get the handle
2075     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2076     // store the klass handle as second argument
2077     __ movptr(c_rarg1, oop_handle_reg);
2078     // and protect the arg if we must spill
2079     c_arg--;
2080   }
2081 
2082   // Change state to native (we save the return address in the thread, since it might not
2083   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2084   // points into the right code segment. It does not have to be the correct return pc.
2085   // We use the same pc/oopMap repeatedly when we call out
2086 
2087   Label native_return;
2088   if (method->is_object_wait0()) {
2089     // For convenience we use the pc we want to resume to in case of preemption on Object.wait.
2090     __ set_last_Java_frame(rsp, noreg, native_return, rscratch1);
2091   } else {
2092     intptr_t the_pc = (intptr_t) __ pc();
2093     oop_maps->add_gc_map(the_pc - start, map);
2094 
2095     __ set_last_Java_frame(rsp, noreg, __ pc(), rscratch1);
2096   }
2097 
2098   // We have all of the arguments set up at this point. We must not touch any of the argument
2099   // registers from here on (if we had to save/restore them, no oop map would describe them).
2100 
2101   if (DTraceMethodProbes) {
2102     // protect the args we've loaded
2103     save_args(masm, total_c_args, c_arg, out_regs);
2104     __ mov_metadata(c_rarg1, method());
2105     __ call_VM_leaf(
2106       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2107       r15_thread, c_rarg1);
2108     restore_args(masm, total_c_args, c_arg, out_regs);
2109   }
2110 
2111   // RedefineClasses() tracing support for obsolete method entry
2112   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2113     // protect the args we've loaded
2114     save_args(masm, total_c_args, c_arg, out_regs);
2115     __ mov_metadata(c_rarg1, method());
2116     __ call_VM_leaf(
2117       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2118       r15_thread, c_rarg1);
2119     restore_args(masm, total_c_args, c_arg, out_regs);
2120   }
2121 
2122   // Lock a synchronized method
2123 
2124   // Register definitions used by locking and unlocking
2125 
2126   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2127   const Register obj_reg  = rbx;  // Will contain the oop
2128   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2129 
2130   Label slow_path_lock;
2131   Label lock_done;
2132 
2133   if (method->is_synchronized()) {
2134     // Get the handle (the 2nd argument)
2135     __ mov(oop_handle_reg, c_rarg1);
2136 
2137     // Get address of the box
2138 
2139     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2140 
2141     // Load the oop from the handle
2142     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2143 
2144     __ lightweight_lock(lock_reg, obj_reg, swap_reg, rscratch1, slow_path_lock);
2145 
2146     // Slow path will re-enter here
2147     __ bind(lock_done);
2148   }
2149 
2150   // Finally just about ready to make the JNI call
2151 
2152   // get JNIEnv* which is first argument to native
2153   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2154 
2155   // Now set thread in native
2156   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2157 
2158   __ call(RuntimeAddress(native_func));
2159 
2160   // Verify or restore cpu control state after JNI call
2161   __ restore_cpu_control_state_after_jni(rscratch1);
2162 
2163   // Unpack native results.
2164   switch (ret_type) {
2165   case T_BOOLEAN: __ c2bool(rax);            break;
2166   case T_CHAR   : __ movzwl(rax, rax);      break;
2167   case T_BYTE   : __ sign_extend_byte (rax); break;
2168   case T_SHORT  : __ sign_extend_short(rax); break;
2169   case T_INT    : /* nothing to do */        break;
2170   case T_DOUBLE :
2171   case T_FLOAT  :
2172     // Result is in xmm0; we'll save as needed
2173     break;
2174   case T_ARRAY:                 // Really a handle
2175   case T_OBJECT:                // Really a handle
2176       break; // can't de-handlize until after safepoint check
2177   case T_VOID: break;
2178   case T_LONG: break;
2179   default       : ShouldNotReachHere();
2180   }
2181 
2182   // Switch thread to "native transition" state before reading the synchronization state.
2183   // This additional state is necessary because reading and testing the synchronization
2184   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2185   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2186   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2187   //     Thread A is resumed to finish this native method, but doesn't block here since it
2188   //     didn't see any synchronization in progress, and escapes.
2189   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2190 
2191   // Force this write out before the read below
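  // (this needs a StoreLoad barrier; when UseSystemMemoryBarrier is enabled the VM issues a
  // process-wide barrier instead, so the local fence is skipped)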
2192   if (!UseSystemMemoryBarrier) {
2193     __ membar(Assembler::Membar_mask_bits(
2194               Assembler::LoadLoad | Assembler::LoadStore |
2195               Assembler::StoreLoad | Assembler::StoreStore));
2196   }
2197 
2198   // check for safepoint operation in progress and/or pending suspend requests
2199   {
2200     Label Continue;
2201     Label slow_path;
2202 
2203     __ safepoint_poll(slow_path, true /* at_return */, false /* in_nmethod */);
2204 
2205     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2206     __ jcc(Assembler::equal, Continue);
2207     __ bind(slow_path);
2208 
2209     // Don't use call_VM as it will see a possible pending exception and forward it
2210     // and never return here, preventing us from clearing _last_native_pc down below.
2211     // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2212     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2213     // by hand.
2214     //
2215     __ vzeroupper();
2216     save_native_result(masm, ret_type, stack_slots);
2217     __ mov(c_rarg0, r15_thread);
2218     __ mov(r12, rsp); // remember sp
2219     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2220     __ andptr(rsp, -16); // align stack as required by ABI
2221     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2222     __ mov(rsp, r12); // restore sp
2223     __ reinit_heapbase();
2224     // Restore any method result value
2225     restore_native_result(masm, ret_type, stack_slots);
2226     __ bind(Continue);
2227   }
2228 
2229   // change thread state
2230   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2231 
2232   if (method->is_object_wait0()) {
2233     // Check preemption for Object.wait()
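    // If the wait was preempted, preempt_alternate_return holds the pc to resume at:
    // clear the field and jump there. Otherwise continue at native_return below.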
2234     __ movptr(rscratch1, Address(r15_thread, JavaThread::preempt_alternate_return_offset()));
2235     __ cmpptr(rscratch1, NULL_WORD);
2236     __ jccb(Assembler::equal, native_return);
2237     __ movptr(Address(r15_thread, JavaThread::preempt_alternate_return_offset()), NULL_WORD);
2238     __ jmp(rscratch1);
2239     __ bind(native_return);
2240 
2241     intptr_t the_pc = (intptr_t) __ pc();
2242     oop_maps->add_gc_map(the_pc - start, map);
2243   }
2244 
2245 
2246   Label reguard;
2247   Label reguard_done;
2248   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2249   __ jcc(Assembler::equal, reguard);
2250   __ bind(reguard_done);
2251 
2252   // native result if any is live
2253 
2254   // Unlock
2255   Label slow_path_unlock;
2256   Label unlock_done;
2257   if (method->is_synchronized()) {
2258 
2259     Label fast_done;
2260 
2261     // Get locked oop from the handle we passed to jni
2262     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2263 
2264     // Must save rax if it is live now because cmpxchg must use it
2265     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2266       save_native_result(masm, ret_type, stack_slots);
2267     }
2268 
2269     __ lightweight_unlock(obj_reg, swap_reg, lock_reg, slow_path_unlock);
2270 
2271     // slow path re-enters here
2272     __ bind(unlock_done);
2273     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2274       restore_native_result(masm, ret_type, stack_slots);
2275     }
2276 
2277     __ bind(fast_done);
2278   }
2279   if (DTraceMethodProbes) {
2280     save_native_result(masm, ret_type, stack_slots);
2281     __ mov_metadata(c_rarg1, method());
2282     __ call_VM_leaf(
2283          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2284          r15_thread, c_rarg1);
2285     restore_native_result(masm, ret_type, stack_slots);
2286   }
2287 
2288   __ reset_last_Java_frame(false);
2289 
2290   // Unbox oop result, e.g. JNIHandles::resolve value.
2291   if (is_reference_type(ret_type)) {
2292     __ resolve_jobject(rax /* value */,
2293                        rcx /* tmp */);
2294   }
2295 
2296   if (CheckJNICalls) {
2297     // clear_pending_jni_exception_check
2298     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2299   }
2300 
2301   // reset handle block
2302   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2303   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
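  // Resetting the top pointer effectively frees any JNI local handles created during the native call.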
2304 
2305   // pop our frame
2306 
2307   __ leave();
2308 
2309 #if INCLUDE_JFR
2310   // We need to do a poll test after unwind in case the sampler
2311   // managed to sample the native frame after returning to Java.
2312   Label L_return;
2313   address poll_test_pc = __ pc();
2314   __ relocate(relocInfo::poll_return_type);
2315   __ testb(Address(r15_thread, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
2316   __ jccb(Assembler::zero, L_return);
2317   __ lea(rscratch1, InternalAddress(poll_test_pc));
2318   __ movptr(Address(r15_thread, JavaThread::saved_exception_pc_offset()), rscratch1);
2319   assert(SharedRuntime::polling_page_return_handler_blob() != nullptr,
2320     "polling page return stub not created yet");
2321   address stub = SharedRuntime::polling_page_return_handler_blob()->entry_point();
2322   __ jump(RuntimeAddress(stub));
2323   __ bind(L_return);
2324 #endif // INCLUDE_JFR
2325 
2326   // Any exception pending?
2327   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2328   __ jcc(Assembler::notEqual, exception_pending);
2329 
2330   // Return
2331 
2332   __ ret(0);
2333 
2334   // Unexpected paths are out of line and go here
2335 
2336   // forward the exception
2337   __ bind(exception_pending);
2338 
2339   // and forward the exception
2340   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2341 
2342   // Slow path locking & unlocking
2343   if (method->is_synchronized()) {
2344 
2345     // BEGIN Slow path lock
2346     __ bind(slow_path_lock);
2347 
2348     // We have last_Java_frame setup. No exceptions, so do a vanilla call, not call_VM
2349     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2350 
2351     // protect the args we've loaded
2352     save_args(masm, total_c_args, c_arg, out_regs);
2353 
2354     __ mov(c_rarg0, obj_reg);
2355     __ mov(c_rarg1, lock_reg);
2356     __ mov(c_rarg2, r15_thread);
2357 
2358     // Not a leaf but we have last_Java_frame setup as we want.
2359     // We don't want to unmount in case of contention since that would complicate preserving
2360     // the arguments that had already been marshalled into the native convention. So we force
2361     // the freeze slow path to find this native wrapper frame (see recurse_freeze_native_frame())
2362     // and pin the vthread. Otherwise the fast path won't find it since we don't walk the stack.
2363     __ push_cont_fastpath();
2364     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2365     __ pop_cont_fastpath();
2366     restore_args(masm, total_c_args, c_arg, out_regs);
2367 
2368 #ifdef ASSERT
2369     { Label L;
2370     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2371     __ jcc(Assembler::equal, L);
2372     __ stop("no pending exception allowed on exit from monitorenter");
2373     __ bind(L);
2374     }
2375 #endif
2376     __ jmp(lock_done);
2377 
2378     // END Slow path lock
2379 
2380     // BEGIN Slow path unlock
2381     __ bind(slow_path_unlock);
2382 
2383     // If we haven't already saved the native result we must save it now as xmm registers
2384     // are still exposed.
2385     __ vzeroupper();
2386     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2387       save_native_result(masm, ret_type, stack_slots);
2388     }
2389 
2390     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2391 
2392     __ mov(c_rarg0, obj_reg);
2393     __ mov(c_rarg2, r15_thread);
2394     __ mov(r12, rsp); // remember sp
2395     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2396     __ andptr(rsp, -16); // align stack as required by ABI
2397 
2398     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2399     // NOTE that obj_reg == rbx currently
2400     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2401     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2402 
2403     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2404     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2405     __ mov(rsp, r12); // restore sp
2406     __ reinit_heapbase();
2407 #ifdef ASSERT
2408     {
2409       Label L;
2410       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2411       __ jcc(Assembler::equal, L);
2412       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2413       __ bind(L);
2414     }
2415 #endif /* ASSERT */
2416 
2417     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2418 
2419     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2420       restore_native_result(masm, ret_type, stack_slots);
2421     }
2422     __ jmp(unlock_done);
2423 
2424     // END Slow path unlock
2425 
2426   } // synchronized
2427 
2428   // SLOW PATH Reguard the stack if needed
2429 
2430   __ bind(reguard);
2431   __ vzeroupper();
2432   save_native_result(masm, ret_type, stack_slots);
2433   __ mov(r12, rsp); // remember sp
2434   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2435   __ andptr(rsp, -16); // align stack as required by ABI
2436   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2437   __ mov(rsp, r12); // restore sp
2438   __ reinit_heapbase();
2439   restore_native_result(masm, ret_type, stack_slots);
2440   // and continue
2441   __ jmp(reguard_done);
2442 
2443 
2444 
2445   __ flush();
2446 
2447   nmethod *nm = nmethod::new_native_nmethod(method,
2448                                             compile_id,
2449                                             masm->code(),
2450                                             vep_offset,
2451                                             frame_complete,
2452                                             stack_slots / VMRegImpl::slots_per_word,
2453                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2454                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2455                                             oop_maps);
2456 
2457   return nm;
2458 }
2459 
2460 // this function returns the adjustment (in number of words) to a c2i adapter
2461 // activation for use during deoptimization
2462 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2463   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2464 }
2465 
2466 
2467 uint SharedRuntime::out_preserve_stack_slots() {
2468   return 0;
2469 }
2470 
2471 
2472 // Number of stack slots between incoming argument block and the start of
2473 // a new frame.  The PROLOG must add this many slots to the stack.  The
2474 // EPILOG must remove this many slots.  amd64 needs two slots for
2475 // return address.
2476 uint SharedRuntime::in_preserve_stack_slots() {
2477   return 4 + 2 * VerifyStackAtCalls;
2478 }
2479 
2480 VMReg SharedRuntime::thread_register() {
2481   return r15_thread->as_VMReg();
2482 }
2483 
2484 //------------------------------generate_deopt_blob----------------------------
2485 void SharedRuntime::generate_deopt_blob() {
2486   // Allocate space for the code
2487   ResourceMark rm;
2488   // Setup code generation tools
2489   int pad = 0;
2490   if (UseAVX > 2) {
2491     pad += 1024;
2492   }
2493   if (UseAPX) {
2494     pad += 1024;
2495   }
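  // The extra padding accounts for the larger register save/restore code emitted when
  // wide vector (AVX-512) or extended GPR (APX) state must be preserved.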
2496 #if INCLUDE_JVMCI
2497   if (EnableJVMCI) {
2498     pad += 512; // Increase the buffer size when compiling for JVMCI
2499   }
2500 #endif
2501   const char* name = SharedRuntime::stub_name(StubId::shared_deopt_id);
2502   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
2503   if (blob != nullptr) {
2504     _deopt_blob = blob->as_deoptimization_blob();
2505     return;
2506   }
2507 
2508   CodeBuffer buffer(name, 2560+pad, 1024);
2509   MacroAssembler* masm = new MacroAssembler(&buffer);
2510   int frame_size_in_words;
2511   OopMap* map = nullptr;
2512   OopMapSet *oop_maps = new OopMapSet();
2513 
2514   // -------------
2515   // This code enters when returning to a de-optimized nmethod.  A return
2516   // address has been pushed on the stack, and return values are in
2517   // registers.
2518   // If we are doing a normal deopt then we were called from the patched
2519   // nmethod from the point we returned to the nmethod. So the return
2520   // address on the stack is wrong by NativeCall::instruction_size.
2521   // We will adjust the value so it looks like we have the original return
2522   // address on the stack (like when we eagerly deoptimized).
2523   // In the case of an exception pending when deoptimizing, we enter
2524   // with a return address on the stack that points after the call we patched
2525   // into the exception handler. We have the following register state from,
2526   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2527   //    rax: exception oop
2528   //    rbx: exception handler
2529   //    rdx: throwing pc
2530   // So in this case we simply jam rdx into the useless return address and
2531   // the stack looks just like we want.
2532   //
2533   // At this point we need to de-opt.  We save the argument return
2534   // registers.  We call the first C routine, fetch_unroll_info().  This
2535   // routine captures the return values and returns a structure which
2536   // describes the current frame size and the sizes of all replacement frames.
2537   // The current frame is compiled code and may contain many inlined
2538   // functions, each with their own JVM state.  We pop the current frame, then
2539   // push all the new frames.  Then we call the C routine unpack_frames() to
2540   // populate these frames.  Finally unpack_frames() returns us the new target
2541   // address.  Notice that callee-save registers are BLOWN here; they have
2542   // already been captured in the vframeArray at the time the return PC was
2543   // patched.
2544   address start = __ pc();
2545   Label cont;
2546 
2547   // Prolog for non exception case!
2548 
2549   // Save everything in sight.
2550   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2551 
2552   // Normal deoptimization.  Save exec mode for unpack_frames.
2553   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2554   __ jmp(cont);
2555 
2556   int reexecute_offset = __ pc() - start;
2557 #if INCLUDE_JVMCI && !defined(COMPILER1)
2558   if (UseJVMCICompiler) {
2559     // JVMCI does not use this kind of deoptimization
2560     __ should_not_reach_here();
2561   }
2562 #endif
2563 
2564   // Reexecute case
2565   // the return address is the pc that describes which bci to re-execute at
2566 
2567   // No need to update map as each call to save_live_registers will produce identical oopmap
2568   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2569 
2570   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2571   __ jmp(cont);
2572 
2573 #if INCLUDE_JVMCI
2574   Label after_fetch_unroll_info_call;
2575   int implicit_exception_uncommon_trap_offset = 0;
2576   int uncommon_trap_offset = 0;
2577 
2578   if (EnableJVMCI) {
2579     implicit_exception_uncommon_trap_offset = __ pc() - start;
2580 
2581     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2582     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2583 
2584     uncommon_trap_offset = __ pc() - start;
2585 
2586     // Save everything in sight.
2587     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2588     // fetch_unroll_info needs to call last_java_frame()
2589     __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2590 
2591     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2592     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2593 
2594     __ movl(r14, Deoptimization::Unpack_reexecute);
2595     __ mov(c_rarg0, r15_thread);
2596     __ movl(c_rarg2, r14); // exec mode
2597     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2598     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2599 
2600     __ reset_last_Java_frame(false);
2601 
2602     __ jmp(after_fetch_unroll_info_call);
2603   } // EnableJVMCI
2604 #endif // INCLUDE_JVMCI
2605 
2606   int exception_offset = __ pc() - start;
2607 
2608   // Prolog for exception case
2609 
2610   // all registers are dead at this entry point, except for rax, and
2611   // rdx which contain the exception oop and exception pc
2612   // respectively.  Set them in TLS and fall thru to the
2613   // unpack_with_exception_in_tls entry point.
2614 
2615   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2616   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2617 
2618   int exception_in_tls_offset = __ pc() - start;
2619 
2620   // new implementation because exception oop is now passed in JavaThread
2621 
2622   // Prolog for exception case
2623   // All registers must be preserved because they might be used by LinearScan
2624   // Exception oop and throwing PC are passed in JavaThread
2625   // tos: stack at point of call to method that threw the exception (i.e. only
2626   // args are on the stack, no return address)
2627 
2628   // make room on stack for the return address
2629   // It will be patched later with the throwing pc. The correct value is not
2630   // available now because loading it from memory would destroy registers.
2631   __ push(0);
2632 
2633   // Save everything in sight.
2634   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2635 
2636   // Now it is safe to overwrite any register
2637 
2638   // Deopt during an exception.  Save exec mode for unpack_frames.
2639   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2640 
2641   // load throwing pc from JavaThread and patch it as the return address
2642   // of the current frame. Then clear the field in JavaThread
2643 
2644   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2645   __ movptr(Address(rbp, wordSize), rdx);
2646   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2647 
2648 #ifdef ASSERT
2649   // verify that there is really an exception oop in JavaThread
2650   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2651   __ verify_oop(rax);
2652 
2653   // verify that there is no pending exception
2654   Label no_pending_exception;
2655   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2656   __ testptr(rax, rax);
2657   __ jcc(Assembler::zero, no_pending_exception);
2658   __ stop("must not have pending exception here");
2659   __ bind(no_pending_exception);
2660 #endif
2661 
2662   __ bind(cont);
2663 
2664   // Call C code.  Need thread and this frame, but NOT official VM entry
2665   // crud.  We cannot block on this call, no GC can happen.
2666   //
2667   // UnrollBlock* fetch_unroll_info(JavaThread* thread, int exec_mode)
2668 
2669   // fetch_unroll_info needs to call last_java_frame().
2670 
2671   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2672 #ifdef ASSERT
2673   { Label L;
2674     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2675     __ jcc(Assembler::equal, L);
2676     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2677     __ bind(L);
2678   }
2679 #endif // ASSERT
2680   __ mov(c_rarg0, r15_thread);
2681   __ movl(c_rarg1, r14); // exec_mode
2682   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2683 
2684   // Need to have an oopmap that tells fetch_unroll_info where to
2685   // find any register it might need.
2686   oop_maps->add_gc_map(__ pc() - start, map);
2687 
2688   __ reset_last_Java_frame(false);
2689 
2690 #if INCLUDE_JVMCI
2691   if (EnableJVMCI) {
2692     __ bind(after_fetch_unroll_info_call);
2693   }
2694 #endif
2695 
2696   // Load UnrollBlock* into rdi
2697   __ mov(rdi, rax);
2698 
2699   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
2700   Label noException;
2701   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2702   __ jcc(Assembler::notEqual, noException);
2703   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2704   // QQQ this is useless; it was null above
2705   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2706   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2707   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2708 
2709   __ verify_oop(rax);
2710 
2711   // Overwrite the result registers with the exception results.
2712   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2713   // I think this is useless
2714   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2715 
2716   __ bind(noException);
2717 
2718   // Only register save data is on the stack.
2719   // Now restore the result registers.  Everything else is either dead
2720   // or captured in the vframeArray.
2721   RegisterSaver::restore_result_registers(masm);
2722 
2723   // All of the register save area has been popped off the stack. Only the
2724   // return address remains.
2725 
2726   // Pop all the frames we must move/replace.
2727   //
2728   // Frame picture (youngest to oldest)
2729   // 1: self-frame (no frame link)
2730   // 2: deopting frame  (no frame link)
2731   // 3: caller of deopting frame (could be compiled/interpreted).
2732   //
2733   // Note: by leaving the return address of self-frame on the stack
2734   // and using the size of frame 2 to adjust the stack
2735   // when we are done the return to frame 3 will still be on the stack.
2736 
2737   // Pop deoptimized frame
2738   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
2739   __ addptr(rsp, rcx);
2740 
2741   // rsp should be pointing at the return address to the caller (3)
2742 
2743   // Pick up the initial fp we should save.
2744   // Restore rbp before the stack bang because, if a stack overflow is thrown, it needs to be pushed (and preserved).
2745   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
2746 
2747 #ifdef ASSERT
2748   // Compilers generate code that bangs the stack by as much as the
2749   // interpreter would need. So this stack banging should never
2750   // trigger a fault. Verify that it does not on non-product builds.
2751   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
2752   __ bang_stack_size(rbx, rcx);
2753 #endif
2754 
2755   // Load address of array of frame pcs into rcx
2756   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
2757 
2758   // Trash the old pc
2759   __ addptr(rsp, wordSize);
2760 
2761   // Load address of array of frame sizes into rsi
2762   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
2763 
2764   // Load counter into rdx
2765   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
2766 
2767   // Now adjust the caller's stack to make up for the extra locals,
2768   // but record the original sp so that we can save it in the skeletal interpreter
2769   // frame; that way the stack walking of interpreter_sender will get the unextended sp
2770   // value and not the "real" sp value.
2771 
2772   const Register sender_sp = r8;
2773 
2774   __ mov(sender_sp, rsp);
2775   __ movl(rbx, Address(rdi,
2776                        Deoptimization::UnrollBlock::
2777                        caller_adjustment_offset()));
2778   __ subptr(rsp, rbx);
2779 
2780   // Push interpreter frames in a loop
2781   Label loop;
2782   __ bind(loop);
2783   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2784   __ subptr(rbx, 2*wordSize);           // We'll push pc and rbp by hand
2785   __ pushptr(Address(rcx, 0));          // Save return address
2786   __ enter();                           // Save old & set new rbp
2787   __ subptr(rsp, rbx);                  // Prolog
2788   // This value is corrected by layout_activation_impl
2789   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
2790   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2791   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2792   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2793   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2794   __ decrementl(rdx);                   // Decrement counter
2795   __ jcc(Assembler::notZero, loop);
2796   __ pushptr(Address(rcx, 0));          // Save final return address
2797 
2798   // Re-push self-frame
2799   __ enter();                           // Save old & set new rbp
2800 
2801   // Allocate a full sized register save area.
2802   // Return address and rbp are in place, so we allocate two fewer words.
2803   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2804 
2805   // Restore frame locals after moving the frame
2806   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2807   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2808 
2809   // Call C code.  Need thread but NOT official VM entry
2810   // crud.  We cannot block on this call, no GC can happen.  Call should
2811   // restore return values to their stack-slots with the new SP.
2812   //
2813   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2814 
2815   // Use rbp because the frames look interpreted now
2816   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2817   // Don't need the precise return PC here, just precise enough to point into this code blob.
2818   address the_pc = __ pc();
2819   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
2820 
2821   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2822   __ mov(c_rarg0, r15_thread);
2823   __ movl(c_rarg1, r14); // second arg: exec_mode
2824   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2825   // Revert SP alignment after call since we're going to do some SP relative addressing below
2826   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2827 
2828   // Set an oopmap for the call site
2829   // Use the same PC we used for the last java frame
2830   oop_maps->add_gc_map(the_pc - start,
2831                        new OopMap( frame_size_in_words, 0 ));
2832 
2833   // Clear fp AND pc
2834   __ reset_last_Java_frame(true);
2835 
2836   // Collect return values
2837   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2838   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2839   // I think this is useless (throwing pc?)
2840   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2841 
2842   // Pop self-frame.
2843   __ leave();                           // Epilog
2844 
2845   // Jump to interpreter
2846   __ ret(0);
2847 
2848   // Make sure all code is generated
2849   masm->flush();
2850 
2851   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2852   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2853 #if INCLUDE_JVMCI
2854   if (EnableJVMCI) {
2855     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2856     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2857   }
2858 #endif
2859 
2860   AOTCodeCache::store_code_blob(*_deopt_blob, AOTCodeEntry::SharedBlob, BlobId::shared_deopt_id);
2861 }
2862 
2863 //------------------------------generate_handler_blob------
2864 //
2865 // Generate a special Compile2Runtime blob that saves all registers
2866 // and sets up an oopmap.
2867 //
2868 SafepointBlob* SharedRuntime::generate_handler_blob(StubId id, address call_ptr) {
2869   assert(StubRoutines::forward_exception_entry() != nullptr,
2870          "must be generated before");
2871   assert(is_polling_page_id(id), "expected a polling page stub id");
2872 
2873   // Allocate space for the code.  Setup code generation tools.
2874   const char* name = SharedRuntime::stub_name(id);
2875   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
2876   if (blob != nullptr) {
2877     return blob->as_safepoint_blob();
2878   }
2879 
2880   ResourceMark rm;
2881   OopMapSet *oop_maps = new OopMapSet();
2882   OopMap* map;
2883   CodeBuffer buffer(name, 2548, 1024);
2884   MacroAssembler* masm = new MacroAssembler(&buffer);
2885 
2886   address start   = __ pc();
2887   address call_pc = nullptr;
2888   int frame_size_in_words;
2889   bool cause_return = (id == StubId::shared_polling_page_return_handler_id);
2890   bool save_wide_vectors = (id == StubId::shared_polling_page_vectors_safepoint_handler_id);
2891 
2892   // Make room for return address (or push it again)
2893   if (!cause_return) {
2894     __ push(rbx);
2895   }
2896 
2897   // Save registers, fpu state, and flags
2898   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
2899 
2900   // The following is basically a call_VM.  However, we need the precise
2901   // address of the call in order to generate an oopmap. Hence, we do all the
2902   // work ourselves.
2903 
2904   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
2905 
2906   // The return address must always be correct so that the frame constructor
2907   // never sees an invalid pc.
2908 
2909   if (!cause_return) {
2910     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
2911     // Additionally, rbx is a callee-saved register, so we can look at it later to determine
2912     // whether someone changed the return address for us!
2913     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
2914     __ movptr(Address(rbp, wordSize), rbx);
2915   }
2916 
2917   // Do the call
2918   __ mov(c_rarg0, r15_thread);
2919   __ call(RuntimeAddress(call_ptr));
2920 
2921   // Set an oopmap for the call site.  This oopmap will map all
2922   // oop-registers and debug-info registers as callee-saved.  This
2923   // will allow deoptimization at this safepoint to find all possible
2924   // debug-info recordings, as well as let GC find all oops.
2925 
2926   oop_maps->add_gc_map( __ pc() - start, map);
2927 
2928   Label noException;
2929 
2930   __ reset_last_Java_frame(false);
2931 
2932   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
2933   __ jcc(Assembler::equal, noException);
2934 
2935   // Exception pending
2936 
2937   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
2938 
2939   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2940 
2941   // No exception case
2942   __ bind(noException);
2943 
2944   Label no_adjust;
2945 #ifdef ASSERT
2946   Label bail;
2947 #endif
2948   if (!cause_return) {
2949     Label no_prefix, not_special, check_rex_prefix;
2950 
2951     // If our stashed return pc was modified by the runtime we avoid touching it
2952     __ cmpptr(rbx, Address(rbp, wordSize));
2953     __ jcc(Assembler::notEqual, no_adjust);
2954 
2955     // Skip over the poll instruction.
2956     // See NativeInstruction::is_safepoint_poll()
2957     // Possible encodings:
2958     //      85 00       test   %eax,(%rax)
2959     //      85 01       test   %eax,(%rcx)
2960     //      85 02       test   %eax,(%rdx)
2961     //      85 03       test   %eax,(%rbx)
2962     //      85 06       test   %eax,(%rsi)
2963     //      85 07       test   %eax,(%rdi)
2964     //
2965     //   41 85 00       test   %eax,(%r8)
2966     //   41 85 01       test   %eax,(%r9)
2967     //   41 85 02       test   %eax,(%r10)
2968     //   41 85 03       test   %eax,(%r11)
2969     //   41 85 06       test   %eax,(%r14)
2970     //   41 85 07       test   %eax,(%r15)
2971     //
2972     //      85 04 24    test   %eax,(%rsp)
2973     //   41 85 04 24    test   %eax,(%r12)
2974     //      85 45 00    test   %eax,0x0(%rbp)
2975     //   41 85 45 00    test   %eax,0x0(%r13)
2976     //
2977     // Notes:
2978     //  Format of the legacy MAP0 test instruction:
2979     //  [REX/REX2] [OPCODE] [ModRM] [SIB] [DISP] [IMM32]
2980     //  o  For a safepoint polling instruction such as "test %eax,(%rax)", the encodings of the
2981     //     first register operand and of the base register of the memory operand lie in [0-8),
2982     //     so no additional REX prefix (whose REX.B bit would hold the MSB of the register
2983     //     encoding) is required and a two-byte encoding is sufficient.
2984     //  o  For a safepoint polling instruction such as "test %eax,(%r8)", the encoding of the
2985     //     BASE register of the memory operand is 1000, so an additional REX prefix is needed,
2986     //     adding one byte to the instruction encoding.
2987     //  o  If the BASE register is one of the 32 extended GPRs available only on targets
2988     //     supporting the Intel APX extension, a two-byte REX2 prefix is emitted to hold the
2989     //     most significant two bits of the 5-bit register encoding.
2990 
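         // Worked example (illustrative): for the 4-byte poll encoding "41 85 45 00"
         // (test %eax,0x0(%r13)), rbx initially points at the 0x41 REX prefix.  The prefix check
         // below advances rbx by one byte to the 0x85 opcode; the special-base check (ModRM low
         // bits 0x05, i.e. r13/rbp) advances it by one more byte to account for the disp8; and the
         // final addptr(rbx, 2) then steps over the opcode and ModRM bytes, leaving rbx just past
         // the poll instruction.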
2991     if (VM_Version::supports_apx_f()) {
2992       __ cmpb(Address(rbx, 0), Assembler::REX2);
2993       __ jccb(Assembler::notEqual, check_rex_prefix);
2994       __ addptr(rbx, 2);
2995       __ bind(check_rex_prefix);
2996     }
2997     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
2998     __ jccb(Assembler::notEqual, no_prefix);
2999     __ addptr(rbx, 1);
3000     __ bind(no_prefix);
3001 #ifdef ASSERT
3002     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3003 #endif
3004     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3005     // r12/rsp 0x04
3006     // r13/rbp 0x05
3007     __ movzbq(rcx, Address(rbx, 1));
3008     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3009     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3010     __ cmpptr(rcx, 1);
3011     __ jccb(Assembler::above, not_special);
3012     __ addptr(rbx, 1);
3013     __ bind(not_special);
3014 #ifdef ASSERT
3015     // Verify the correct encoding of the poll we're about to skip.
3016     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3017     __ jcc(Assembler::notEqual, bail);
3018     // Mask out the modrm bits
3019     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3020     // rax encodes to 0, so if the bits are nonzero it's incorrect
3021     __ jcc(Assembler::notZero, bail);
3022 #endif
3023     // Adjust return pc forward to step over the safepoint poll instruction
3024     __ addptr(rbx, 2);
3025     __ movptr(Address(rbp, wordSize), rbx);
3026   }
3027 
3028   __ bind(no_adjust);
3029   // Normal exit, restore registers and exit.
3030   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3031   __ ret(0);
3032 
3033 #ifdef ASSERT
3034   __ bind(bail);
3035   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3036 #endif
3037 
3038   // Make sure all code is generated
3039   masm->flush();
3040 
3041   // Fill-out other meta info
3042   SafepointBlob* sp_blob = SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3043 
3044   AOTCodeCache::store_code_blob(*sp_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3045   return sp_blob;
3046 }
3047 
3048 //
3049 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3050 //
3051 // Generate a stub that calls into the VM to find out the proper destination
3052 // of a java call. All the argument registers are live at this point,
3053 // but since this is generic code we don't know what they are, and the caller
3054 // must do any GC of the args.
3055 //
3056 RuntimeStub* SharedRuntime::generate_resolve_blob(StubId id, address destination) {
3057   assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3058   assert(is_resolve_id(id), "expected a resolve stub id");
3059 
3060   const char* name = SharedRuntime::stub_name(id);
3061   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3062   if (blob != nullptr) {
3063     return blob->as_runtime_stub();
3064   }
3065 
3066   // allocate space for the code
3067   ResourceMark rm;
3068   CodeBuffer buffer(name, 1552, 512);
3069   MacroAssembler* masm = new MacroAssembler(&buffer);
3070 
3071   int frame_size_in_words;
3072 
3073   OopMapSet *oop_maps = new OopMapSet();
3074   OopMap* map = nullptr;
3075 
3076   int start = __ offset();
3077 
3078   // No need to save vector registers since they are caller-saved anyway.
3079   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3080 
3081   int frame_complete = __ offset();
3082 
3083   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3084 
3085   __ mov(c_rarg0, r15_thread);
3086 
3087   __ call(RuntimeAddress(destination));
3088 
3089 
3090   // Set an oopmap for the call site.
3091   // We need this not only for callee-saved registers, but also for volatile
3092   // registers that the compiler might be keeping live across a safepoint.
3093 
3094   oop_maps->add_gc_map( __ offset() - start, map);
3095 
3096   // rax contains the address we are going to jump to assuming no exception got installed
3097 
3098   // clear last_Java_sp
3099   __ reset_last_Java_frame(false);
3100   // check for pending exceptions
3101   Label pending;
3102   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3103   __ jcc(Assembler::notEqual, pending);
3104 
3105   // get the returned Method*
3106   __ get_vm_result_metadata(rbx);
3107   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3108 
3109   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3110 
3111   RegisterSaver::restore_live_registers(masm);
3112 
3113   // We are back to the original state on entry and ready to go.
3114 
3115   __ jmp(rax);
3116 
3117   // Pending exception after the safepoint
3118 
3119   __ bind(pending);
3120 
3121   RegisterSaver::restore_live_registers(masm);
3122 
3123   // exception pending => remove activation and forward to exception handler
3124 
3125   __ movptr(Address(r15_thread, JavaThread::vm_result_oop_offset()), NULL_WORD);
3126 
3127   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3128   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3129 
3130   // -------------
3131   // make sure all code is generated
3132   masm->flush();
3133 
3134   // return the  blob
3135   // frame_size_words or bytes??
3136   RuntimeStub* rs_blob = RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3137 
3138   AOTCodeCache::store_code_blob(*rs_blob, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3139   return rs_blob;
3140 }
3141 
3142 // Continuation point for throwing of implicit exceptions that are
3143 // not handled in the current activation. Fabricates an exception
3144 // oop and initiates normal exception dispatching in this
3145 // frame. Since we need to preserve callee-saved values (currently
3146 // only for C2, but done for C1 as well) we need a callee-saved oop
3147 // map and therefore have to make these stubs into RuntimeStubs
3148 // rather than BufferBlobs.  If the compiler needs all registers to
3149 // be preserved between the fault point and the exception handler
3150 // then it must assume responsibility for that in
3151 // AbstractCompiler::continuation_for_implicit_null_exception or
3152 // continuation_for_implicit_division_by_zero_exception. All other
3153 // implicit exceptions (e.g., NullPointerException or
3154 // AbstractMethodError on entry) are either at call sites or
3155 // otherwise assume that stack unwinding will be initiated, so
3156 // caller saved registers were assumed volatile in the compiler.
3157 RuntimeStub* SharedRuntime::generate_throw_exception(StubId id, address runtime_entry) {
3158   assert(is_throw_id(id), "expected a throw stub id");
3159 
3160   const char* name = SharedRuntime::stub_name(id);
3161 
3162   // Information about frame layout at time of blocking runtime call.
3163   // Note that we only have to preserve callee-saved registers since
3164   // the compilers are responsible for supplying a continuation point
3165   // if they expect all registers to be preserved.
3166   enum layout {
3167     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
3168     rbp_off2,
3169     return_off,
3170     return_off2,
3171     framesize // inclusive of return address
3172   };
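       // framesize above is counted in 32-bit (jint) stack slots; it is converted to machine
       // words when the RuntimeStub is created at the end of this function.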
3173 
3174   int insts_size = 512;
3175   int locs_size  = 64;
3176 
3177   const char* timer_msg = "SharedRuntime generate_throw_exception";
3178   TraceTime timer(timer_msg, TRACETIME_LOG(Info, startuptime));
3179 
3180   CodeBlob* blob = AOTCodeCache::load_code_blob(AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3181   if (blob != nullptr) {
3182     return blob->as_runtime_stub();
3183   }
3184 
3185   ResourceMark rm;
3186   CodeBuffer code(name, insts_size, locs_size);
3187   OopMapSet* oop_maps  = new OopMapSet();
3188   MacroAssembler* masm = new MacroAssembler(&code);
3189 
3190   address start = __ pc();
3191 
3192   // This is an inlined and slightly modified version of call_VM,
3193   // which has the ability to fetch the return PC out of
3194   // thread-local storage and also sets up last_Java_sp slightly
3195   // differently from the real call_VM.
3196 
3197   __ enter(); // required for proper stackwalking of RuntimeStub frame
3198 
3199   assert(is_even(framesize/2), "sp not 16-byte aligned");
3200 
3201   // return address and rbp are already in place
3202   __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
3203 
3204   int frame_complete = __ pc() - start;
3205 
3206   // Set up last_Java_sp and last_Java_fp
3207   address the_pc = __ pc();
3208   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3209   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3210 
3211   // Call runtime
3212   __ movptr(c_rarg0, r15_thread);
3213   BLOCK_COMMENT("call runtime_entry");
3214   __ call(RuntimeAddress(runtime_entry));
3215 
3216   // Generate oop map
3217   OopMap* map = new OopMap(framesize, 0);
3218 
3219   oop_maps->add_gc_map(the_pc - start, map);
3220 
3221   __ reset_last_Java_frame(true);
3222 
3223   __ leave(); // required for proper stackwalking of RuntimeStub frame
3224 
3225   // check for pending exceptions
3226 #ifdef ASSERT
3227   Label L;
3228   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3229   __ jcc(Assembler::notEqual, L);
3230   __ should_not_reach_here();
3231   __ bind(L);
3232 #endif // ASSERT
3233   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3234 
3235 
3236   // codeBlob framesize is in words (not VMRegImpl::slot_size)
3237   RuntimeStub* stub =
3238     RuntimeStub::new_runtime_stub(name,
3239                                   &code,
3240                                   frame_complete,
3241                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3242                                   oop_maps, false);
3243   AOTCodeCache::store_code_blob(*stub, AOTCodeEntry::SharedBlob, StubInfo::blob(id));
3244 
3245   return stub;
3246 }
3247 
3248 //------------------------------Montgomery multiplication------------------------
3249 //
3250 
3251 #ifndef _WINDOWS
3252 
3253 // Subtract 0:b from carry:a.  Return carry.
3254 static julong
3255 sub(julong a[], julong b[], julong carry, long len) {
3256   long long i = 0, cnt = len;
3257   julong tmp;
3258   asm volatile("clc; "
3259                "0: ; "
3260                "mov (%[b], %[i], 8), %[tmp]; "
3261                "sbb %[tmp], (%[a], %[i], 8); "
3262                "inc %[i]; dec %[cnt]; "
3263                "jne 0b; "
3264                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3265                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3266                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3267                : "memory");
3268   return tmp;
3269 }
3270 
3271 // Multiply (unsigned) Long A by Long B, accumulating the double-
3272 // length result into the accumulator formed of T0, T1, and T2.
3273 #define MACC(A, B, T0, T1, T2)                                  \
3274 do {                                                            \
3275   unsigned long hi, lo;                                         \
3276   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3277            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3278            : "r"(A), "a"(B) : "cc");                            \
3279  } while(0)
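     // Roughly, in illustrative C (not part of the build), MACC(A, B, T0, T1, T2) does:
     //   unsigned __int128 p = (unsigned __int128)(A) * (B);
     //   (T2:T1:T0) += p;   // 192-bit accumulate; carries propagate from T0 to T1 to T2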
3280 
3281 // As above, but add twice the double-length result into the
3282 // accumulator.
3283 #define MACC2(A, B, T0, T1, T2)                                 \
3284 do {                                                            \
3285   unsigned long hi, lo;                                         \
3286   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3287            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3288            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3289            : "r"(A), "a"(B) : "cc");                            \
3290  } while(0)
3291 
3292 #else //_WINDOWS
3293 
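     // Subtract 0:b from carry:a and return the resulting carry word (the _WINDOWS counterpart of
     // the inline-assembly sub() above).  The subtraction is performed as a + ~b + 1: the carry
     // chain starts with c = 1, and each _addcarry_u64 step propagates the complemented borrow.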
3294 static julong
3295 sub(julong a[], julong b[], julong carry, long len) {
3296   long i;
3297   julong tmp;
3298   unsigned char c = 1;
3299   for (i = 0; i < len; i++) {
3300     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3301     a[i] = tmp;
3302   }
3303   c = _addcarry_u64(c, carry, ~0, &tmp);
3304   return tmp;
3305 }
3306 
3307 // Multiply (unsigned) Long A by Long B, accumulating the double-
3308 // length result into the accumulator formed of T0, T1, and T2.
3309 #define MACC(A, B, T0, T1, T2)                          \
3310 do {                                                    \
3311   julong hi, lo;                            \
3312   lo = _umul128(A, B, &hi);                             \
3313   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3314   c = _addcarry_u64(c, hi, T1, &T1);                    \
3315   _addcarry_u64(c, T2, 0, &T2);                         \
3316  } while(0)
3317 
3318 // As above, but add twice the double-length result into the
3319 // accumulator.
3320 #define MACC2(A, B, T0, T1, T2)                         \
3321 do {                                                    \
3322   julong hi, lo;                            \
3323   lo = _umul128(A, B, &hi);                             \
3324   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3325   c = _addcarry_u64(c, hi, T1, &T1);                    \
3326   _addcarry_u64(c, T2, 0, &T2);                         \
3327   c = _addcarry_u64(0, lo, T0, &T0);                    \
3328   c = _addcarry_u64(c, hi, T1, &T1);                    \
3329   _addcarry_u64(c, T2, 0, &T2);                         \
3330  } while(0)
3331 
3332 #endif //_WINDOWS
3333 
3334 // Fast Montgomery multiplication.  The derivation of the algorithm is
3335 // in  A Cryptographic Library for the Motorola DSP56000,
3336 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
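     //
     // In outline: with R = 2^(64*len), montgomery_multiply() below computes m such that
     // m == a * b * R^-1 (mod n).  On every iteration a word-sized multiple of n is added so that
     // the low 64 bits of the accumulator become zero and can be shifted out; inv satisfies
     // inv * n[0] == -1 (mod 2^64) (see the asserts), which is what makes t0 + m[i]*n[0] vanish
     // mod 2^64.  A final conditional subtraction of n brings the result into range.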
3337 
3338 static void NOINLINE
3339 montgomery_multiply(julong a[], julong b[], julong n[],
3340                     julong m[], julong inv, int len) {
3341   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3342   int i;
3343 
3344   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3345 
3346   for (i = 0; i < len; i++) {
3347     int j;
3348     for (j = 0; j < i; j++) {
3349       MACC(a[j], b[i-j], t0, t1, t2);
3350       MACC(m[j], n[i-j], t0, t1, t2);
3351     }
3352     MACC(a[i], b[0], t0, t1, t2);
3353     m[i] = t0 * inv;
3354     MACC(m[i], n[0], t0, t1, t2);
3355 
3356     assert(t0 == 0, "broken Montgomery multiply");
3357 
3358     t0 = t1; t1 = t2; t2 = 0;
3359   }
3360 
3361   for (i = len; i < 2*len; i++) {
3362     int j;
3363     for (j = i-len+1; j < len; j++) {
3364       MACC(a[j], b[i-j], t0, t1, t2);
3365       MACC(m[j], n[i-j], t0, t1, t2);
3366     }
3367     m[i-len] = t0;
3368     t0 = t1; t1 = t2; t2 = 0;
3369   }
3370 
3371   while (t0)
3372     t0 = sub(m, n, t0, len);
3373 }
3374 
3375 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3376 // multiplies so it should be up to 25% faster than Montgomery
3377 // multiplication.  However, its loop control is more complex and it
3378 // may actually run slower on some machines.
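     // (The saving comes from symmetry: each off-diagonal product a[j]*a[i-j] equals a[i-j]*a[j],
     // so it is computed once and added twice via MACC2 instead of being computed twice.)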
3379 
3380 static void NOINLINE
3381 montgomery_square(julong a[], julong n[],
3382                   julong m[], julong inv, int len) {
3383   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3384   int i;
3385 
3386   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3387 
3388   for (i = 0; i < len; i++) {
3389     int j;
3390     int end = (i+1)/2;
3391     for (j = 0; j < end; j++) {
3392       MACC2(a[j], a[i-j], t0, t1, t2);
3393       MACC(m[j], n[i-j], t0, t1, t2);
3394     }
3395     if ((i & 1) == 0) {
3396       MACC(a[j], a[j], t0, t1, t2);
3397     }
3398     for (; j < i; j++) {
3399       MACC(m[j], n[i-j], t0, t1, t2);
3400     }
3401     m[i] = t0 * inv;
3402     MACC(m[i], n[0], t0, t1, t2);
3403 
3404     assert(t0 == 0, "broken Montgomery square");
3405 
3406     t0 = t1; t1 = t2; t2 = 0;
3407   }
3408 
3409   for (i = len; i < 2*len; i++) {
3410     int start = i-len+1;
3411     int end = start + (len - start)/2;
3412     int j;
3413     for (j = start; j < end; j++) {
3414       MACC2(a[j], a[i-j], t0, t1, t2);
3415       MACC(m[j], n[i-j], t0, t1, t2);
3416     }
3417     if ((i & 1) == 0) {
3418       MACC(a[j], a[j], t0, t1, t2);
3419     }
3420     for (; j < len; j++) {
3421       MACC(m[j], n[i-j], t0, t1, t2);
3422     }
3423     m[i-len] = t0;
3424     t0 = t1; t1 = t2; t2 = 0;
3425   }
3426 
3427   while (t0)
3428     t0 = sub(m, n, t0, len);
3429 }
3430 
3431 // Swap words in a longword.
3432 static julong swap(julong x) {
3433   return (x << 32) | (x >> 32);
3434 }
3435 
3436 // Copy len longwords from s to d, word-swapping as we go.  The
3437 // destination array is reversed.
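     // For example, with len == 2 and s == { 0x0000000200000001, 0x0000000400000003 } the result
     // is d == { 0x0000000300000004, 0x0000000100000002 }.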
3438 static void reverse_words(julong *s, julong *d, int len) {
3439   d += len;
3440   while(len-- > 0) {
3441     d--;
3442     *d = swap(*s);
3443     s++;
3444   }
3445 }
3446 
3447 // The threshold at which squaring is advantageous was determined
3448 // experimentally on an i7-3930K (Sandy Bridge-E) CPU @ 3.5GHz.
3449 #define MONTGOMERY_SQUARING_THRESHOLD 64
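     // (The threshold is measured in jints: 64 jints corresponds to a 2048-bit operand.)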
3450 
3451 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3452                                         jint len, jlong inv,
3453                                         jint *m_ints) {
3454   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3455   int longwords = len/2;
3456 
3457   // Make very sure we don't use so much space that the stack might
3458   // overflow.  512 jints corresponds to a 16384-bit integer and
3459   // will use here a total of 8k bytes of stack space.
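       // Concretely: the guarantee below bounds longwords by 8192 / (sizeof(julong) * 4) == 256,
       // and 256 longwords * 8 bytes * 4 scratch arrays == 8192 bytes, matching the 8k figure above.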
3460   int divisor = sizeof(julong) * 4;
3461   guarantee(longwords <= 8192 / divisor, "must be");
3462   int total_allocation = longwords * sizeof (julong) * 4;
3463   julong *scratch = (julong *)alloca(total_allocation);
3464 
3465   // Local scratch arrays
3466   julong
3467     *a = scratch + 0 * longwords,
3468     *b = scratch + 1 * longwords,
3469     *n = scratch + 2 * longwords,
3470     *m = scratch + 3 * longwords;
3471 
3472   reverse_words((julong *)a_ints, a, longwords);
3473   reverse_words((julong *)b_ints, b, longwords);
3474   reverse_words((julong *)n_ints, n, longwords);
3475 
3476   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3477 
3478   reverse_words(m, (julong *)m_ints, longwords);
3479 }
3480 
3481 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3482                                       jint len, jlong inv,
3483                                       jint *m_ints) {
3484   assert(len % 2 == 0, "array length in montgomery_square must be even");
3485   int longwords = len/2;
3486 
3487   // Make very sure we don't use so much space that the stack might
3488   // overflow.  512 jints corresponds to a 16384-bit integer and
3489   // will use here a total of 6k bytes of stack space.
3490   int divisor = sizeof(julong) * 3;
3491   guarantee(longwords <= (8192 / divisor), "must be");
3492   int total_allocation = longwords * sizeof (julong) * 3;
3493   julong *scratch = (julong *)alloca(total_allocation);
3494 
3495   // Local scratch arrays
3496   julong
3497     *a = scratch + 0 * longwords,
3498     *n = scratch + 1 * longwords,
3499     *m = scratch + 2 * longwords;
3500 
3501   reverse_words((julong *)a_ints, a, longwords);
3502   reverse_words((julong *)n_ints, n, longwords);
3503 
3504   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3505     ::montgomery_square(a, n, m, (julong)inv, longwords);
3506   } else {
3507     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3508   }
3509 
3510   reverse_words(m, (julong *)m_ints, longwords);
3511 }
3512 
3513 #if INCLUDE_JFR
3514 
3515 // For c2: c_rarg0 is junk; call into the runtime to write a checkpoint.
3516 // It returns a jobject handle to the event writer.
3517 // The handle is dereferenced and the return value is the event writer oop.
3518 RuntimeStub* SharedRuntime::generate_jfr_write_checkpoint() {
3519   enum layout {
3520     rbp_off,
3521     rbpH_off,
3522     return_off,
3523     return_off2,
3524     framesize // inclusive of return address
3525   };
3526 
3527   const char* name = SharedRuntime::stub_name(StubId::shared_jfr_write_checkpoint_id);
3528   CodeBuffer code(name, 1024, 64);
3529   MacroAssembler* masm = new MacroAssembler(&code);
3530   address start = __ pc();
3531 
3532   __ enter();
3533   address the_pc = __ pc();
3534 
3535   int frame_complete = the_pc - start;
3536 
3537   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
3538   __ movptr(c_rarg0, r15_thread);
3539   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
3540   __ reset_last_Java_frame(true);
3541 
3542   // rax is jobject handle result, unpack and process it through a barrier.
3543   __ resolve_global_jobject(rax, c_rarg0);
3544 
3545   __ leave();
3546   __ ret(0);
3547 
3548   OopMapSet* oop_maps = new OopMapSet();
3549   OopMap* map = new OopMap(framesize, 1);
3550   oop_maps->add_gc_map(frame_complete, map);
3551 
3552   RuntimeStub* stub =
3553     RuntimeStub::new_runtime_stub(name,
3554                                   &code,
3555                                   frame_complete,
3556                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3557                                   oop_maps,
3558                                   false);
3559   return stub;
3560 }
3561 
3562 // For c2: call to return a leased buffer.
3563 RuntimeStub* SharedRuntime::generate_jfr_return_lease() {
3564   enum layout {
3565     rbp_off,
3566     rbpH_off,
3567     return_off,
3568     return_off2,
3569     framesize // inclusive of return address
3570   };
3571 
3572   const char* name = SharedRuntime::stub_name(StubId::shared_jfr_return_lease_id);
3573   CodeBuffer code(name, 1024, 64);
3574   MacroAssembler* masm = new MacroAssembler(&code);
3575   address start = __ pc();
3576 
3577   __ enter();
3578   address the_pc = __ pc();
3579 
3580   int frame_complete = the_pc - start;
3581 
3582   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch2);
3583   __ movptr(c_rarg0, r15_thread);
3584   __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
3585   __ reset_last_Java_frame(true);
3586 
3587   __ leave();
3588   __ ret(0);
3589 
3590   OopMapSet* oop_maps = new OopMapSet();
3591   OopMap* map = new OopMap(framesize, 1);
3592   oop_maps->add_gc_map(frame_complete, map);
3593 
3594   RuntimeStub* stub =
3595     RuntimeStub::new_runtime_stub(name,
3596                                   &code,
3597                                   frame_complete,
3598                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3599                                   oop_maps,
3600                                   false);
3601   return stub;
3602 }
3603 
3604 #endif // INCLUDE_JFR
3605